rm(list = ls())
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(cluster)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.6     v purrr   0.3.4
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag()     masks stats::lag()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret) # advance package with more than 200 prediction models. 
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
library(naivebayes)
## naivebayes 0.9.7 loaded
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.2
library(parallel)
library(tictoc)
library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-3
library(leaflet)
library(kknn)
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy
library(leaps)
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
## 
##     cement
## The following object is masked from 'package:datasets':
## 
##     rivers
library(skimr)
library(forcats)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(gbm)
## Loaded gbm 2.1.8
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
## 
##     compute
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
## 
##     slice
## The following object is masked from 'package:dplyr':
## 
##     slice
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
## 
##     coords
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(rpart)
library(rpart.plot)

# NOTE(review): the original called setwd("") which always errors
# ("cannot change working directory"). Read the CSV relative to the current
# working directory instead; adjust the path if the file lives elsewhere.
data <- read.csv(file = "OnlineNewsPopularity.csv")

I am going to use a useful tool from a previous year, parallel programming, so that we can compute faster:

# Start a parallel backend and register it with the foreach/doParallel
# machinery so that caret::train() etc. can run folds in parallel.
#
# model - label used in the progress message.
#
# Returns the cluster object; pass it to endParallel() when done.
startParallel <- function(model = "The model") {
  print(paste(model, "has started processing. Wait until it finishes."))
  # Leave one core free for the OS, but guard against single-core machines:
  # detectCores() - 1 would be 0 there and makeCluster(0) fails.
  cores <- max(1L, detectCores() - 1L)
  cl <- makeCluster(cores)
  registerDoParallel(cl)
  cl
}
# Shut down a parallel cluster created by startParallel() and announce
# completion.
#
# cl    - cluster object returned by startParallel().
# model - label used in the completion message.
endParallel <- function(cl, model = "The model") {
  stopCluster(cl)
  done_msg <- paste(model, "has finished.")
  print(done_msg)
}
# Quick overview of every column's distribution (min/quartiles/mean/max).
summary(data)
##      url              timedelta     n_tokens_title n_tokens_content
##  Length:39644       Min.   :  8.0   Min.   : 2.0   Min.   :   0.0  
##  Class :character   1st Qu.:164.0   1st Qu.: 9.0   1st Qu.: 246.0  
##  Mode  :character   Median :339.0   Median :10.0   Median : 409.0  
##                     Mean   :354.5   Mean   :10.4   Mean   : 546.5  
##                     3rd Qu.:542.0   3rd Qu.:12.0   3rd Qu.: 716.0  
##                     Max.   :731.0   Max.   :23.0   Max.   :8474.0  
##  n_unique_tokens    n_non_stop_words    n_non_stop_unique_tokens
##  Min.   :  0.0000   Min.   :   0.0000   Min.   :  0.0000        
##  1st Qu.:  0.4709   1st Qu.:   1.0000   1st Qu.:  0.6257        
##  Median :  0.5392   Median :   1.0000   Median :  0.6905        
##  Mean   :  0.5482   Mean   :   0.9965   Mean   :  0.6892        
##  3rd Qu.:  0.6087   3rd Qu.:   1.0000   3rd Qu.:  0.7546        
##  Max.   :701.0000   Max.   :1042.0000   Max.   :650.0000        
##    num_hrefs      num_self_hrefs       num_imgs         num_videos   
##  Min.   :  0.00   Min.   :  0.000   Min.   :  0.000   Min.   : 0.00  
##  1st Qu.:  4.00   1st Qu.:  1.000   1st Qu.:  1.000   1st Qu.: 0.00  
##  Median :  8.00   Median :  3.000   Median :  1.000   Median : 0.00  
##  Mean   : 10.88   Mean   :  3.294   Mean   :  4.544   Mean   : 1.25  
##  3rd Qu.: 14.00   3rd Qu.:  4.000   3rd Qu.:  4.000   3rd Qu.: 1.00  
##  Max.   :304.00   Max.   :116.000   Max.   :128.000   Max.   :91.00  
##  average_token_length  num_keywords    data_channel_is_lifestyle
##  Min.   :0.000        Min.   : 1.000   Min.   :0.00000          
##  1st Qu.:4.478        1st Qu.: 6.000   1st Qu.:0.00000          
##  Median :4.664        Median : 7.000   Median :0.00000          
##  Mean   :4.548        Mean   : 7.224   Mean   :0.05295          
##  3rd Qu.:4.855        3rd Qu.: 9.000   3rd Qu.:0.00000          
##  Max.   :8.042        Max.   :10.000   Max.   :1.00000          
##  data_channel_is_entertainment data_channel_is_bus data_channel_is_socmed
##  Min.   :0.000                 Min.   :0.0000      Min.   :0.0000        
##  1st Qu.:0.000                 1st Qu.:0.0000      1st Qu.:0.0000        
##  Median :0.000                 Median :0.0000      Median :0.0000        
##  Mean   :0.178                 Mean   :0.1579      Mean   :0.0586        
##  3rd Qu.:0.000                 3rd Qu.:0.0000      3rd Qu.:0.0000        
##  Max.   :1.000                 Max.   :1.0000      Max.   :1.0000        
##  data_channel_is_tech data_channel_is_world   kw_min_min       kw_max_min    
##  Min.   :0.0000       Min.   :0.0000        Min.   : -1.00   Min.   :     0  
##  1st Qu.:0.0000       1st Qu.:0.0000        1st Qu.: -1.00   1st Qu.:   445  
##  Median :0.0000       Median :0.0000        Median : -1.00   Median :   660  
##  Mean   :0.1853       Mean   :0.2126        Mean   : 26.11   Mean   :  1154  
##  3rd Qu.:0.0000       3rd Qu.:0.0000        3rd Qu.:  4.00   3rd Qu.:  1000  
##  Max.   :1.0000       Max.   :1.0000        Max.   :377.00   Max.   :298400  
##    kw_avg_min        kw_min_max       kw_max_max       kw_avg_max    
##  Min.   :   -1.0   Min.   :     0   Min.   :     0   Min.   :     0  
##  1st Qu.:  141.8   1st Qu.:     0   1st Qu.:843300   1st Qu.:172847  
##  Median :  235.5   Median :  1400   Median :843300   Median :244572  
##  Mean   :  312.4   Mean   : 13612   Mean   :752324   Mean   :259282  
##  3rd Qu.:  357.0   3rd Qu.:  7900   3rd Qu.:843300   3rd Qu.:330980  
##  Max.   :42827.9   Max.   :843300   Max.   :843300   Max.   :843300  
##    kw_min_avg     kw_max_avg       kw_avg_avg    self_reference_min_shares
##  Min.   :  -1   Min.   :     0   Min.   :    0   Min.   :     0           
##  1st Qu.:   0   1st Qu.:  3562   1st Qu.: 2382   1st Qu.:   639           
##  Median :1024   Median :  4356   Median : 2870   Median :  1200           
##  Mean   :1117   Mean   :  5657   Mean   : 3136   Mean   :  3999           
##  3rd Qu.:2057   3rd Qu.:  6020   3rd Qu.: 3600   3rd Qu.:  2600           
##  Max.   :3613   Max.   :298400   Max.   :43568   Max.   :843300           
##  self_reference_max_shares self_reference_avg_sharess weekday_is_monday
##  Min.   :     0            Min.   :     0.0           Min.   :0.000    
##  1st Qu.:  1100            1st Qu.:   981.2           1st Qu.:0.000    
##  Median :  2800            Median :  2200.0           Median :0.000    
##  Mean   : 10329            Mean   :  6401.7           Mean   :0.168    
##  3rd Qu.:  8000            3rd Qu.:  5200.0           3rd Qu.:0.000    
##  Max.   :843300            Max.   :843300.0           Max.   :1.000    
##  weekday_is_tuesday weekday_is_wednesday weekday_is_thursday weekday_is_friday
##  Min.   :0.0000     Min.   :0.0000       Min.   :0.0000      Min.   :0.0000   
##  1st Qu.:0.0000     1st Qu.:0.0000       1st Qu.:0.0000      1st Qu.:0.0000   
##  Median :0.0000     Median :0.0000       Median :0.0000      Median :0.0000   
##  Mean   :0.1864     Mean   :0.1875       Mean   :0.1833      Mean   :0.1438   
##  3rd Qu.:0.0000     3rd Qu.:0.0000       3rd Qu.:0.0000      3rd Qu.:0.0000   
##  Max.   :1.0000     Max.   :1.0000       Max.   :1.0000      Max.   :1.0000   
##  weekday_is_saturday weekday_is_sunday   is_weekend         LDA_00       
##  Min.   :0.00000     Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000     1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.02505  
##  Median :0.00000     Median :0.00000   Median :0.0000   Median :0.03339  
##  Mean   :0.06188     Mean   :0.06904   Mean   :0.1309   Mean   :0.18460  
##  3rd Qu.:0.00000     3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.24096  
##  Max.   :1.00000     Max.   :1.00000   Max.   :1.0000   Max.   :0.92699  
##      LDA_01            LDA_02            LDA_03            LDA_04       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.02501   1st Qu.:0.02857   1st Qu.:0.02857   1st Qu.:0.02857  
##  Median :0.03334   Median :0.04000   Median :0.04000   Median :0.04073  
##  Mean   :0.14126   Mean   :0.21632   Mean   :0.22377   Mean   :0.23403  
##  3rd Qu.:0.15083   3rd Qu.:0.33422   3rd Qu.:0.37576   3rd Qu.:0.39999  
##  Max.   :0.92595   Max.   :0.92000   Max.   :0.92653   Max.   :0.92719  
##  global_subjectivity global_sentiment_polarity global_rate_positive_words
##  Min.   :0.0000      Min.   :-0.39375          Min.   :0.00000           
##  1st Qu.:0.3962      1st Qu.: 0.05776          1st Qu.:0.02838           
##  Median :0.4535      Median : 0.11912          Median :0.03902           
##  Mean   :0.4434      Mean   : 0.11931          Mean   :0.03962           
##  3rd Qu.:0.5083      3rd Qu.: 0.17783          3rd Qu.:0.05028           
##  Max.   :1.0000      Max.   : 0.72784          Max.   :0.15549           
##  global_rate_negative_words rate_positive_words rate_negative_words
##  Min.   :0.000000           Min.   :0.0000      Min.   :0.0000     
##  1st Qu.:0.009615           1st Qu.:0.6000      1st Qu.:0.1852     
##  Median :0.015337           Median :0.7105      Median :0.2800     
##  Mean   :0.016612           Mean   :0.6822      Mean   :0.2879     
##  3rd Qu.:0.021739           3rd Qu.:0.8000      3rd Qu.:0.3846     
##  Max.   :0.184932           Max.   :1.0000      Max.   :1.0000     
##  avg_positive_polarity min_positive_polarity max_positive_polarity
##  Min.   :0.0000        Min.   :0.00000       Min.   :0.0000       
##  1st Qu.:0.3062        1st Qu.:0.05000       1st Qu.:0.6000       
##  Median :0.3588        Median :0.10000       Median :0.8000       
##  Mean   :0.3538        Mean   :0.09545       Mean   :0.7567       
##  3rd Qu.:0.4114        3rd Qu.:0.10000       3rd Qu.:1.0000       
##  Max.   :1.0000        Max.   :1.00000       Max.   :1.0000       
##  avg_negative_polarity min_negative_polarity max_negative_polarity
##  Min.   :-1.0000       Min.   :-1.0000       Min.   :-1.0000      
##  1st Qu.:-0.3284       1st Qu.:-0.7000       1st Qu.:-0.1250      
##  Median :-0.2533       Median :-0.5000       Median :-0.1000      
##  Mean   :-0.2595       Mean   :-0.5219       Mean   :-0.1075      
##  3rd Qu.:-0.1869       3rd Qu.:-0.3000       3rd Qu.:-0.0500      
##  Max.   : 0.0000       Max.   : 0.0000       Max.   : 0.0000      
##  title_subjectivity title_sentiment_polarity abs_title_subjectivity
##  Min.   :0.0000     Min.   :-1.00000         Min.   :0.0000        
##  1st Qu.:0.0000     1st Qu.: 0.00000         1st Qu.:0.1667        
##  Median :0.1500     Median : 0.00000         Median :0.5000        
##  Mean   :0.2824     Mean   : 0.07143         Mean   :0.3418        
##  3rd Qu.:0.5000     3rd Qu.: 0.15000         3rd Qu.:0.5000        
##  Max.   :1.0000     Max.   : 1.00000         Max.   :0.5000        
##  abs_title_sentiment_polarity     shares      
##  Min.   :0.0000               Min.   :     1  
##  1st Qu.:0.0000               1st Qu.:   946  
##  Median :0.0000               Median :  1400  
##  Mean   :0.1561               Mean   :  3395  
##  3rd Qu.:0.2500               3rd Qu.:  2800  
##  Max.   :1.0000               Max.   :843300

Data Set Original Source:

K. Fernandes, P. Vinagre and P. Cortez. A Proactive Intelligent Decision Support System for Predicting the Popularity of Online News. Proceedings of the 17th EPIA 2015 - Portuguese Conference on Artificial Intelligence, September, Coimbra, Portugal

Introduction

This data set summarizes many features about published articles during two years. The target is to predict the number of shares of an article in social media.

We are going to predict the number of shares using classification and advanced regression tools, working with both categorical and numerical variables. We will use two hold-out sets: one for training and testing, and another to obtain the true final performance of the tools presented.

Variables Interpretation

-URL represents the direction of the article.

-Timedelta the days passed between publication and data collection.

-N_Tokens_Title is the number of words in title.

-N_Unique_Tokens is the number of unique words.

-Num_Hrefs for the number of links.

-Data_Channel_Is_XXXXX for the type of content that is published.

-Kw refers to the worst and best keywords depending on the number of shares.

-Self_Reference_ are the shares of Referenced articles, maximum, minimum and the average.

-Weekday refers to what day of the week is the article published on.

-Is_Weekend whether the article was published or not on weekend.

-Lda remarks the closeness to a LDA Topic.

-Global_Subjectivity, Global_Sentiment_Polarity and Global_Rate respectively represent overall rates over the words in the content, such as subjectivity, sentiment, or positive and negative ratings.

-Rate_Positive_Words / Rate_Negative_Words represent the proportions of positive and negative ratings among the tokens that are non-neutral.

-Polarity variables are related to Positive and Negative statistics of the polarity.

-Title variables define how subjective or polar a title is, and Abs_Title defines it in absolute value, without positivity or negativity.

-Shares, the last variable, is the most important one because it is the one we are going to predict: the number of times an article is shared on the media.

Data Preprocessing

First of all, let's check the data for missing values.

# Proportion of missing values per row and per column. Both histograms pile
# up at zero, confirming the data set contains no NAs.
missing_by_row <- rowMeans(is.na(data))
hist(missing_by_row, main = "Histogram of the completed data", xlab = "Proportion of missing values", ylab = "Number of rows")

missing_by_col <- colMeans(is.na(data))
hist(missing_by_col, main = "Histogram of the completed data", xlab = "Proportion of missing values", ylab = "N-Columns")

#No missing values found at all, it seems the data is pretty accurate. 

In order to remove noise, we are going to mix some columns into one, to reduce computing time and noise.

Let´s group all Weeks columns into just one:

data_original <- data  # keep a pristine copy before any recoding
nrows <- nrow(data)    # reused by later loops/checks

# Collapse the seven weekday_is_* dummy columns into one character column.
# Vectorized replacement for the original row-by-row if/else loop: max.col
# with ties.method = "first" reproduces the loop's Monday-first priority,
# and rows with no flag set (none expected) stay NA exactly as before.
day_labels <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                "Saturday", "Sunday")
day_cols <- paste0("weekday_is_", tolower(day_labels))
day_flags <- as.matrix(data[, day_cols]) == 1

data$weekday <- NA
has_day <- rowSums(day_flags) > 0
data$weekday[has_day] <-
  day_labels[max.col(day_flags, ties.method = "first")][has_day]

Now let´s apply the same process but for data channel types:

# Collapse the six data_channel_is_* dummy columns into one character
# column, defaulting to "Other" when no flag is set. Vectorized replacement
# for the original row-by-row if/else loop; the column order below matches
# the loop's branch order, so first-match priority is preserved.
channel_labels <- c("Business", "Entertainment", "Lifestyle",
                    "Social-Media", "Tech", "World")
channel_cols <- c("data_channel_is_bus", "data_channel_is_entertainment",
                  "data_channel_is_lifestyle", "data_channel_is_socmed",
                  "data_channel_is_tech", "data_channel_is_world")
channel_flags <- as.matrix(data[, channel_cols]) == 1

data$channel <- "Other"
has_channel <- rowSums(channel_flags) > 0
data$channel[has_channel] <-
  channel_labels[max.col(channel_flags, ties.method = "first")][has_channel]

Now I am going to get rid of the columns that are no longer useful as we have already collected all its information. Also I will remove a few columns that I consider that are not important for our purposes:

# Drop the named columns from a data frame.
#
# colsVector - character vector of column names to remove.
# dataset    - the data frame to remove them from.
#
# Returns the data frame without the matched columns. If no name matches,
# the data frame is returned unchanged — the original version indexed with
# -integer(0) in that case, which silently dropped EVERY column.
removeCols <- function(colsVector, dataset) {
  ix <- which(colnames(dataset) %in% colsVector)
  if (length(ix) == 0) {
    return(dataset)
  }
  dataset[, -ix]
}

# Columns whose information is now captured by the new `weekday` and
# `channel` columns, plus the article URL, which is an identifier rather
# than a predictor.
colsVector <- c(
  "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
  "weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday",
  "weekday_is_sunday",
  "data_channel_is_world", "data_channel_is_tech", "data_channel_is_socmed",
  "data_channel_is_lifestyle", "data_channel_is_entertainment",
  "data_channel_is_bus",
  "url"
)

# Remove the redundant columns from the working data set.
data <- removeCols(colsVector, data)


# Correlation heat map of the numeric predictors; the two character columns
# (weekday, channel) are skipped by ggcorr, hence the warning below.
ggcorr(data, label=F)
## Warning in ggcorr(data, label = F): data in column(s) 'weekday', 'channel' are
## not numeric and were ignored

We can see that this is a really good data set as no missing values have been found and most of the data is quite relevant. Thanks to the ggcorr plot we can see also that the presented variables have no collinearity that for some supervised learning tools may be a good result. So, seeing the corrplot, there is no need to remove any quite correlated variables.

Data Creation for Classification:

First, we need to create a categorical variable that we can predict, in order to do so, I have decided to create 3 categories: FewLikes, MidLikes and LotLikes, that will separate the data set in 3 parts, and the size of each part depend on the threshold I select. Let´s do it:

# Work on a copy so the numeric `shares` column in `data` stays intact for
# the regression part.
data_csf <- data

# Bin the share counts into three classes:
#   <= 1000 -> FewLikes, 1001-2100 -> MidLikes, > 2100 -> LotLikes.
share_class <- ifelse(data_csf$shares <= 1000, "FewLikes",
                      ifelse(data_csf$shares <= 2100, "MidLikes",
                             "LotLikes"))
data_csf$shares <- factor(share_class,
                          levels = c("FewLikes", "MidLikes", "LotLikes"))

# Confirm the level ordering and compare the numeric vs. categorical views
# of the target variable.
levels(data_csf$shares)
## [1] "FewLikes" "MidLikes" "LotLikes"
summary(data$shares)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1     946    1400    3395    2800  843300
summary(data_csf$shares)
## FewLikes MidLikes LotLikes 
##    12424    14265    12955
# Sanity check: the three bin counts must sum to nrow(data).
# (The original used a 2500 threshold here, which did not match the 2100
# cut-off used to build the classes, so the totals failed to add up; it
# also bound `c`, masking base::c.)
n_few <- length(which(data$shares <= 1000))
n_few_or_mid <- length(which(data$shares <= 2100))
n_lot <- length(which(data$shares > 2100))
n_few; n_few_or_mid - n_few; n_lot
n_few_or_mid + n_lot  # should equal nrow(data)
#This piece of code is just for checking that the partition has been done correctly. 

#data_csf$shares = as.numeric(data_csf$shares)

Here we have created a classification data set, grouping in the 3 categories using factor function, so that the number of shares is spread.

Data Splitting

# Stratified 80/20 split on the class label; createDataPartition keeps the
# class proportions roughly equal in both pieces.
splitting_csf <- createDataPartition(data_csf$shares, p = 0.8, list = FALSE)

dataTrain_csf <- data_csf[splitting_csf, ]
dataTest_csf <- data_csf[-splitting_csf, ]

# Convenience copies of the response for each partition.
y.train <- dataTrain_csf$shares
y.test <- dataTest_csf$shares

We have split the data into two parts: since we have a lot of samples, 20% is held out for the final testing, while the remaining 80% is used for training the models. Moreover, I have split the classification data set so that classification can be computed separately from regression.

Also, we need to take into account that profiles with a huge number of shares, i.e., outliers, will fall inside the LotLikes category. For the moment we must be careful, because the classification set still includes these outliers; we will shortly deal with the ones in the regression set, which are easier to handle.

Visualization tools

Now, I am going to try to see and understand, the classification data set where the only difference is that the classification has the predicted variable as a categorical one. Let´s see a bar plot with each of the levels:

# Class balance in the training set.
ggplot(data=dataTrain_csf) + aes(x = shares) + geom_bar(fill = "darkgreen") + ggtitle("Total shares")

# Interactive bar charts crossing the class label with channel and weekday,
# in both orientations.
p = ggplot(dataTrain_csf, aes(x=shares,fill = channel)) + geom_bar(); ggplotly(p)
p = ggplot(dataTrain_csf, aes(x=channel,fill = shares)) + geom_bar(); ggplotly(p)
p = ggplot(dataTrain_csf, aes(x=shares,fill = weekday)) + geom_bar(); ggplotly(p)
p = ggplot(dataTrain_csf, aes(x=weekday,fill = shares)) + geom_bar(); ggplotly(p)
# NOTE(review): these conversions run AFTER the train/test split, so
# dataTrain_csf/dataTest_csf keep weekday and channel as character columns
# (lda() below dummy-codes them) — confirm this ordering is intentional.
data_csf$weekday = as.numeric(as.factor(data_csf$weekday))
data_csf$channel = as.numeric(as.factor(data_csf$channel))

Here we can see the same 2 plots in a different way, I personally like more the first one as it is more visual. Here we can choose the day of the week and the channel of the media and see that each category is more or less spread all about them.

We are going to apply lot of tools in regression for previsualization, later on. ## Linear Discriminant Analysis; LDA

Now, I am creating just the model for the LDA:

# LDA with a flat prior: each of the three classes weighted equally,
# regardless of its frequency in the training data.
model_lda = lda(shares ~ ., data=dataTrain_csf, prior = c(1/3, 1/3, 1/3)); model_lda
## Warning in lda.default(x, grouping, ...): variables are collinear
## Call:
## lda(shares ~ ., data = dataTrain_csf, prior = c(1/3, 1/3, 1/3))
## 
## Prior probabilities of groups:
##  FewLikes  MidLikes  LotLikes 
## 0.3333333 0.3333333 0.3333333 
## 
## Group means:
##          timedelta n_tokens_title n_tokens_content n_unique_tokens
## FewLikes  345.6901       10.53028         517.7591       0.5410529
## MidLikes  350.8482       10.38854         545.8244       0.5297094
## LotLikes  365.4581       10.29670         573.9718       0.5890351
##          n_non_stop_words n_non_stop_unique_tokens num_hrefs num_self_hrefs
## FewLikes        0.9742455                0.6861185  9.532696       3.027968
## MidLikes        0.9725727                0.6729847 10.720908       3.338766
## LotLikes        1.0637785                0.7228117 12.335295       3.407275
##          num_imgs num_videos average_token_length num_keywords kw_min_min
## FewLikes 3.820121   1.210262             4.579605     7.069316   21.53169
## MidLikes 4.353049   1.183141             4.560364     7.209954   28.48589
## LotLikes 5.431880   1.347356             4.501834     7.395311   28.69973
##          kw_max_min kw_avg_min kw_min_max kw_max_max kw_avg_max kw_min_avg
## FewLikes   1048.457   285.3763   11877.88   761038.8   255398.4   955.0905
## MidLikes   1103.878   307.4499   13457.96   748518.4   255362.8  1114.1738
## LotLikes   1300.295   341.7952   14739.67   747817.5   266309.3  1267.1326
##          kw_max_avg kw_avg_avg self_reference_min_shares
## FewLikes   5161.108   2864.241                  2816.275
## MidLikes   5371.899   3051.414                  3747.274
## LotLikes   6344.925   3462.573                  5590.497
##          self_reference_max_shares self_reference_avg_sharess is_weekend
## FewLikes                  7083.399                   4461.428 0.05492958
## MidLikes                  9883.619                   6084.898 0.14887837
## LotLikes                 14032.695                   8823.294 0.18159012
##             LDA_00    LDA_01    LDA_02    LDA_03    LDA_04 global_subjectivity
## FewLikes 0.1631171 0.1670079 0.2767935 0.2043417 0.1887398           0.4338808
## MidLikes 0.1897776 0.1356326 0.2152267 0.2020751 0.2572880           0.4427988
## LotLikes 0.2021912 0.1242539 0.1596759 0.2616896 0.2520929           0.4533785
##          global_sentiment_polarity global_rate_positive_words
## FewLikes                 0.1105689                 0.03822679
## MidLikes                 0.1214890                 0.03975512
## LotLikes                 0.1261304                 0.04085099
##          global_rate_negative_words rate_positive_words rate_negative_words
## FewLikes                 0.01697033           0.6720537           0.3020912
## MidLikes                 0.01633222           0.6877758           0.2846217
## LotLikes                 0.01650785           0.6861405           0.2770011
##          avg_positive_polarity min_positive_polarity max_positive_polarity
## FewLikes             0.3506987            0.09800050             0.7450717
## MidLikes             0.3536707            0.09507566             0.7563305
## LotLikes             0.3588554            0.09440743             0.7700489
##          avg_negative_polarity min_negative_polarity max_negative_polarity
## FewLikes            -0.2585912            -0.5205257            -0.1074286
## MidLikes            -0.2557215            -0.5145062            -0.1069595
## LotLikes            -0.2638996            -0.5307727            -0.1080979
##          title_subjectivity title_sentiment_polarity abs_title_subjectivity
## FewLikes          0.2729206               0.05483263              0.3402783
## MidLikes          0.2749049               0.07156305              0.3414546
## LotLikes          0.3017967               0.08799310              0.3427411
##          abs_title_sentiment_polarity weekdayMonday weekdaySaturday
## FewLikes                    0.1460433     0.1792757      0.02323944
## MidLikes                    0.1516071     0.1638626      0.06983877
## LotLikes                    0.1695116     0.1604593      0.09002316
##          weekdaySunday weekdayThursday weekdayTuesday weekdayWednesday
## FewLikes    0.03169014       0.2049296      0.2060362        0.2197183
## MidLikes    0.07903961       0.1809499      0.1850683        0.1735892
## LotLikes    0.09156696       0.1677924      0.1697221        0.1723273
##          channelEntertainment channelLifestyle channelOther channelSocial-Media
## FewLikes            0.2386318       0.03812877    0.1162978          0.02022133
## MidLikes            0.1637750       0.05608132    0.1288118          0.06466877
## LotLikes            0.1324778       0.06348900    0.2171941          0.08722501
##          channelTech channelWorld
## FewLikes   0.1237425    0.3011066
## MidLikes   0.2109183    0.2075009
## LotLikes   0.2170976    0.1358549
## 
## Coefficients of linear discriminants:
##                                        LD1           LD2
## timedelta                     1.462389e-05  2.178162e-03
## n_tokens_title                2.234114e-03  1.794558e-02
## n_tokens_content              1.856687e-04  5.127276e-04
## n_unique_tokens              -1.705518e-01  9.237163e-01
## n_non_stop_words              7.328564e-01 -1.397540e+00
## n_non_stop_unique_tokens     -9.835254e-01  1.258208e+00
## num_hrefs                     1.069591e-02  4.955552e-03
## num_self_hrefs               -2.390610e-02 -2.903602e-02
## num_imgs                      1.749186e-03  9.469290e-03
## num_videos                   -1.027378e-04 -1.735870e-02
## average_token_length         -1.529912e-01 -4.644799e-01
## num_keywords                  4.665224e-02  3.078499e-02
## kw_min_min                    2.385898e-03 -6.159271e-03
## kw_max_min                    4.721541e-05  7.136488e-05
## kw_avg_min                   -3.139828e-04 -5.722489e-04
## kw_min_max                   -5.235777e-07 -1.563456e-07
## kw_max_max                   -3.378508e-07 -6.005659e-07
## kw_avg_max                   -6.953823e-07 -6.371711e-07
## kw_min_avg                   -1.095410e-04 -2.205608e-04
## kw_max_avg                   -1.259629e-04 -6.721628e-05
## kw_avg_avg                    9.623514e-04  6.816364e-04
## self_reference_min_shares     4.766393e-08  5.654143e-06
## self_reference_max_shares     7.561956e-07  1.388984e-06
## self_reference_avg_sharess    3.443757e-06 -2.798286e-06
## is_weekend                    4.698710e-01 -3.700727e-01
## LDA_00                        9.177644e-01  1.109198e+00
## LDA_01                       -4.472565e-01  1.511612e-01
## LDA_02                       -6.551381e-01  1.195464e-01
## LDA_03                       -3.621915e-01  2.039814e-01
## LDA_04                        1.847968e-01 -7.854685e-01
## global_subjectivity           9.145843e-01 -3.963933e-01
## global_sentiment_polarity    -3.360282e-01 -8.746186e-01
## global_rate_positive_words   -1.347302e+00  5.001473e+00
## global_rate_negative_words    3.995810e+00 -7.193399e+00
## rate_positive_words           3.619145e-01  1.687126e+00
## rate_negative_words          -3.106400e-01  2.157279e+00
## avg_positive_polarity         1.614009e-01  7.037249e-02
## min_positive_polarity        -4.482421e-01  3.840755e-01
## max_positive_polarity        -1.960804e-01  1.680489e-01
## avg_negative_polarity        -5.083738e-01 -8.376589e-01
## min_negative_polarity         7.656287e-02  6.527369e-02
## max_negative_polarity         3.156968e-01  6.501577e-01
## title_subjectivity            1.749175e-01  3.826049e-01
## title_sentiment_polarity      2.176893e-01  1.477612e-01
## abs_title_subjectivity        3.853358e-01  4.507512e-01
## abs_title_sentiment_polarity -3.731572e-02 -4.871748e-02
## weekdayMonday                -1.709968e-01  3.505963e-01
## weekdaySaturday               5.010216e-01 -1.753058e-01
## weekdaySunday                 3.682952e-01 -4.894109e-01
## weekdayThursday              -3.329585e-01  2.812499e-01
## weekdayTuesday               -3.405809e-01  2.318281e-01
## weekdayWednesday             -3.804344e-01  6.741112e-01
## channelEntertainment         -1.001509e-01  1.068488e+00
## channelLifestyle              2.972539e-01  5.490102e-01
## channelOther                  3.714482e-01  1.374427e+00
## channelSocial-Media           1.371061e+00 -3.747689e-02
## channelTech                   1.027005e+00  6.447557e-01
## channelWorld                  3.179010e-01  1.309428e+00
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9314 0.0686
# Refit the LDA using the default prior (the empirical class proportions of
# the training set); note this overwrites the flat-prior model above.
model_lda = lda(shares ~ ., data=dataTrain_csf); model_lda
## Warning in lda.default(x, grouping, ...): variables are collinear
## Call:
## lda(shares ~ ., data = dataTrain_csf)
## 
## Prior probabilities of groups:
##  FewLikes  MidLikes  LotLikes 
## 0.3134065 0.3598184 0.3267751 
## 
## Group means:
##          timedelta n_tokens_title n_tokens_content n_unique_tokens
## FewLikes  345.6901       10.53028         517.7591       0.5410529
## MidLikes  350.8482       10.38854         545.8244       0.5297094
## LotLikes  365.4581       10.29670         573.9718       0.5890351
##          n_non_stop_words n_non_stop_unique_tokens num_hrefs num_self_hrefs
## FewLikes        0.9742455                0.6861185  9.532696       3.027968
## MidLikes        0.9725727                0.6729847 10.720908       3.338766
## LotLikes        1.0637785                0.7228117 12.335295       3.407275
##          num_imgs num_videos average_token_length num_keywords kw_min_min
## FewLikes 3.820121   1.210262             4.579605     7.069316   21.53169
## MidLikes 4.353049   1.183141             4.560364     7.209954   28.48589
## LotLikes 5.431880   1.347356             4.501834     7.395311   28.69973
##          kw_max_min kw_avg_min kw_min_max kw_max_max kw_avg_max kw_min_avg
## FewLikes   1048.457   285.3763   11877.88   761038.8   255398.4   955.0905
## MidLikes   1103.878   307.4499   13457.96   748518.4   255362.8  1114.1738
## LotLikes   1300.295   341.7952   14739.67   747817.5   266309.3  1267.1326
##          kw_max_avg kw_avg_avg self_reference_min_shares
## FewLikes   5161.108   2864.241                  2816.275
## MidLikes   5371.899   3051.414                  3747.274
## LotLikes   6344.925   3462.573                  5590.497
##          self_reference_max_shares self_reference_avg_sharess is_weekend
## FewLikes                  7083.399                   4461.428 0.05492958
## MidLikes                  9883.619                   6084.898 0.14887837
## LotLikes                 14032.695                   8823.294 0.18159012
##             LDA_00    LDA_01    LDA_02    LDA_03    LDA_04 global_subjectivity
## FewLikes 0.1631171 0.1670079 0.2767935 0.2043417 0.1887398           0.4338808
## MidLikes 0.1897776 0.1356326 0.2152267 0.2020751 0.2572880           0.4427988
## LotLikes 0.2021912 0.1242539 0.1596759 0.2616896 0.2520929           0.4533785
##          global_sentiment_polarity global_rate_positive_words
## FewLikes                 0.1105689                 0.03822679
## MidLikes                 0.1214890                 0.03975512
## LotLikes                 0.1261304                 0.04085099
##          global_rate_negative_words rate_positive_words rate_negative_words
## FewLikes                 0.01697033           0.6720537           0.3020912
## MidLikes                 0.01633222           0.6877758           0.2846217
## LotLikes                 0.01650785           0.6861405           0.2770011
##          avg_positive_polarity min_positive_polarity max_positive_polarity
## FewLikes             0.3506987            0.09800050             0.7450717
## MidLikes             0.3536707            0.09507566             0.7563305
## LotLikes             0.3588554            0.09440743             0.7700489
##          avg_negative_polarity min_negative_polarity max_negative_polarity
## FewLikes            -0.2585912            -0.5205257            -0.1074286
## MidLikes            -0.2557215            -0.5145062            -0.1069595
## LotLikes            -0.2638996            -0.5307727            -0.1080979
##          title_subjectivity title_sentiment_polarity abs_title_subjectivity
## FewLikes          0.2729206               0.05483263              0.3402783
## MidLikes          0.2749049               0.07156305              0.3414546
## LotLikes          0.3017967               0.08799310              0.3427411
##          abs_title_sentiment_polarity weekdayMonday weekdaySaturday
## FewLikes                    0.1460433     0.1792757      0.02323944
## MidLikes                    0.1516071     0.1638626      0.06983877
## LotLikes                    0.1695116     0.1604593      0.09002316
##          weekdaySunday weekdayThursday weekdayTuesday weekdayWednesday
## FewLikes    0.03169014       0.2049296      0.2060362        0.2197183
## MidLikes    0.07903961       0.1809499      0.1850683        0.1735892
## LotLikes    0.09156696       0.1677924      0.1697221        0.1723273
##          channelEntertainment channelLifestyle channelOther channelSocial-Media
## FewLikes            0.2386318       0.03812877    0.1162978          0.02022133
## MidLikes            0.1637750       0.05608132    0.1288118          0.06466877
## LotLikes            0.1324778       0.06348900    0.2171941          0.08722501
##          channelTech channelWorld
## FewLikes   0.1237425    0.3011066
## MidLikes   0.2109183    0.2075009
## LotLikes   0.2170976    0.1358549
## 
## Coefficients of linear discriminants:
##                                        LD1           LD2
## timedelta                     1.975240e-05  2.178121e-03
## n_tokens_title                2.276361e-03  1.794027e-02
## n_tokens_content              1.868754e-04  5.122890e-04
## n_unique_tokens              -1.683764e-01  9.241153e-01
## n_non_stop_words              7.295638e-01 -1.399262e+00
## n_non_stop_unique_tokens     -9.805602e-01  1.260520e+00
## num_hrefs                     1.070754e-02  4.930354e-03
## num_self_hrefs               -2.397440e-02 -2.897965e-02
## num_imgs                      1.771477e-03  9.465145e-03
## num_videos                   -1.436091e-04 -1.735841e-02
## average_token_length         -1.540844e-01 -4.641184e-01
## num_keywords                  4.672459e-02  3.067506e-02
## kw_min_min                    2.371389e-03 -6.164872e-03
## kw_max_min                    4.738331e-05  7.125351e-05
## kw_avg_min                   -3.153293e-04 -5.715080e-04
## kw_min_max                   -5.239444e-07 -1.551124e-07
## kw_max_max                   -3.392639e-07 -5.997688e-07
## kw_avg_max                   -6.968806e-07 -6.355320e-07
## kw_min_avg                   -1.100600e-04 -2.203022e-04
## kw_max_avg                   -1.261208e-04 -6.691951e-05
## kw_avg_avg                    9.639537e-04  6.793686e-04
## self_reference_min_shares     6.097665e-08  5.654015e-06
## self_reference_max_shares     7.594639e-07  1.387199e-06
## self_reference_avg_sharess    3.437159e-06 -2.806387e-06
## is_weekend                    4.689983e-01 -3.711780e-01
## LDA_00                        9.203735e-01  1.107034e+00
## LDA_01                       -4.468994e-01  1.522139e-01
## LDA_02                       -6.548548e-01  1.210886e-01
## LDA_03                       -3.617102e-01  2.048337e-01
## LDA_04                        1.829469e-01 -7.859014e-01
## global_subjectivity           9.136484e-01 -3.985456e-01
## global_sentiment_polarity    -3.380866e-01 -8.738250e-01
## global_rate_positive_words   -1.335522e+00  5.004632e+00
## global_rate_negative_words    3.978861e+00 -7.202787e+00
## rate_positive_words           3.658858e-01  1.686269e+00
## rate_negative_words          -3.055598e-01  2.158004e+00
## avg_positive_polarity         1.615662e-01  6.999228e-02
## min_positive_polarity        -4.473366e-01  3.851298e-01
## max_positive_polarity        -1.956842e-01  1.685101e-01
## avg_negative_polarity        -5.103446e-01 -8.364596e-01
## min_negative_polarity         7.671634e-02  6.509324e-02
## max_negative_polarity         3.172267e-01  6.494126e-01
## title_subjectivity            1.758179e-01  3.821920e-01
## title_sentiment_polarity      2.180366e-01  1.472483e-01
## abs_title_subjectivity        3.863961e-01  4.498426e-01
## abs_title_sentiment_polarity -3.743032e-02 -4.862949e-02
## weekdayMonday                -1.701708e-01  3.509980e-01
## weekdaySaturday               5.006074e-01 -1.764850e-01
## weekdaySunday                 3.671418e-01 -4.902767e-01
## weekdayThursday              -3.322954e-01  2.820331e-01
## weekdayTuesday               -3.400341e-01  2.326293e-01
## weekdayWednesday             -3.788462e-01  6.750051e-01
## channelEntertainment         -9.763482e-02  1.068721e+00
## channelLifestyle              2.985458e-01  5.483088e-01
## channelOther                  3.746833e-01  1.373549e+00
## channelSocial-Media           1.370969e+00 -4.070499e-02
## channelTech                   1.028520e+00  6.423358e-01
## channelWorld                  3.209832e-01  1.308676e+00
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9262 0.0738

The `prior` argument is not mandatory, but it is important: it sets the three class probabilities pi1 = Few, pi2 = Mid, pi3 = Lot, i.e. the assumed proportion of shares in each category. Specifying it amounts to guessing those proportions from the data set. We finally omit it because, for this data set, the three proportions are roughly equal (in which case `lda()` estimates them from the training data). These priors behave like hyper-parameters: they encode beliefs about future data that we cannot observe.

Now we are going to predict and compute the future probabilities:

# Posterior class probabilities for every observation in the test set.
probability <- predict(model_lda, newdata = dataTest_csf)$posterior
head(probability)
##     FewLikes  MidLikes   LotLikes
## 3  0.5489891 0.3494641 0.10154681
## 7  0.5421282 0.3502987 0.10757309
## 13 0.5994757 0.2579056 0.14261865
## 14 0.7389681 0.1914193 0.06961263
## 18 0.6042526 0.2881476 0.10759982
## 20 0.4388105 0.3963282 0.16486122

Since this is a Bayesian tool, the machine-learning output is a probability for each class. Choosing the class with the maximum probability is mathematically optimal, but for a company the asymmetric misclassification costs might make a different decision rule preferable. As cost is not an issue here, let's take the maximum:

# Index of the column holding the highest posterior probability per row.
prediction <- max.col(probability)
head(prediction)
## [1] 1 1 1 1 1 1
# predict()$class returns the factor labels directly, so no manual
# index-to-label mapping is required.
prediction <- predict(model_lda, newdata = dataTest_csf)$class
head(prediction)
## [1] FewLikes FewLikes FewLikes FewLikes FewLikes FewLikes
## Levels: FewLikes MidLikes LotLikes

We have chosen the column with the highest probability, but that yields class numbers while we want category labels, so we use the `class` output of `predict()`, which returns the factor labels directly.

LDA performance

The confusion matrix has predictions in rows and true values in columns (although the order can be swapped). To assess the performance of the LDA model, we evaluate it on the testing set and inspect this matrix, which cross-tabulates the predictions against the true values.

# Full confusion-matrix report for the LDA predictions on the test set.
confusionMatrix(data = prediction, reference = dataTest_csf$shares)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FewLikes MidLikes LotLikes
##   FewLikes     1424      903      554
##   MidLikes      678     1078      799
##   LotLikes      382      872     1238
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4717          
##                  95% CI : (0.4607, 0.4828)
##     No Information Rate : 0.3599          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2085          
##                                           
##  Mcnemar's Test P-Value : 2.05e-14        
## 
## Statistics by Class:
## 
##                      Class: FewLikes Class: MidLikes Class: LotLikes
## Sensitivity                   0.5733          0.3778          0.4778
## Specificity                   0.7324          0.7090          0.7650
## Pos Pred Value                0.4943          0.4219          0.4968
## Neg Pred Value                0.7900          0.6696          0.7511
## Prevalence                    0.3133          0.3599          0.3268
## Detection Rate                0.1796          0.1360          0.1562
## Detection Prevalence          0.3634          0.3223          0.3143
## Balanced Accuracy             0.6528          0.5434          0.6214
# Just the raw cross-tabulation of predictions vs. true classes.
confusionMatrix(data = prediction, reference = dataTest_csf$shares)$table
##           Reference
## Prediction FewLikes MidLikes LotLikes
##   FewLikes     1424      903      554
##   MidLikes      678     1078      799
##   LotLikes      382      872     1238
# Overall accuracy, extracted by name rather than by position.
confusionMatrix(data = prediction, reference = dataTest_csf$shares)$overall["Accuracy"]
##  Accuracy 
## 0.4717457

If we were doing regression, we would compute the R squared (we will do so later), but as we are not, we look at the confusion matrix instead. The kappa is around 0.21 and the accuracy around 47%, which is not bad at all given how easy and cheap this model is to compute — but it is definitely not accurate.

It is important to take into account that, for an LDA model, this accuracy is reasonably high, helped by the fact that the class split is highly balanced. Nevertheless, I am going to try a different approach.

Quadratic Discriminant Analysis: QDA

Let´s apply same tools as in LDA but with a different model approach for the moment: First, I need to re-split

# Class balance of the classification training split (roughly even).
summary(dataTrain_csf$shares)
## FewLikes MidLikes LotLikes 
##     9940    11412    10364
# Stratified 85/15 split for the QDA experiment.
splitting_qda <- createDataPartition(data_csf$shares, p = 0.85, list = FALSE)

dataTrain_qda <- data_csf[splitting_qda, ]
dataTest_qda  <- data_csf[-splitting_qda, ]

summary(dataTrain_qda$shares)
## FewLikes MidLikes LotLikes 
##    10561    12126    11012
# Drop columns 6-10 and 13-27, keeping as few predictors as possible
# to avoid collinearity, which breaks the QDA model.
dataTrain_qda <- dataTrain_qda[, -c(6:10, 13:27)]

# Quadratic discriminant analysis on the reduced predictor set.
model_qda <- qda(shares ~ ., data = dataTrain_qda)
model_qda
## Call:
## qda(shares ~ ., data = dataTrain_qda)
## 
## Prior probabilities of groups:
##  FewLikes  MidLikes  LotLikes 
## 0.3133921 0.3598326 0.3267753 
## 
## Group means:
##          timedelta n_tokens_title n_tokens_content n_unique_tokens
## FewLikes  347.5032       10.52457         519.0209       0.5410393
## MidLikes  350.2134       10.39048         548.7354       0.5288285
## LotLikes  366.6715       10.30104         574.5031       0.5222045
##          n_non_stop_words average_token_length num_keywords    LDA_02    LDA_03
## FewLikes        0.9744342             4.582809     7.063062 0.2766769 0.2071301
## MidLikes        0.9720435             4.557338     7.208972 0.2148112 0.2044536
## LotLikes        0.9646749             4.507581     7.396749 0.1585813 0.2637528
##             LDA_04 global_subjectivity global_sentiment_polarity
## FewLikes 0.1891222           0.4333829                 0.1099478
## MidLikes 0.2552931           0.4433531                 0.1218988
## LotLikes 0.2513853           0.4545307                 0.1261176
##          global_rate_positive_words global_rate_negative_words
## FewLikes                 0.03818540                 0.01710310
## MidLikes                 0.03989204                 0.01637471
## LotLikes                 0.04079279                 0.01653743
##          rate_positive_words rate_negative_words avg_positive_polarity
## FewLikes           0.6705364           0.3038031             0.3504094
## MidLikes           0.6876688           0.2842098             0.3534693
## LotLikes           0.6866771           0.2779070             0.3588756
##          min_positive_polarity max_positive_polarity avg_negative_polarity
## FewLikes            0.09815644             0.7440926            -0.2587443
## MidLikes            0.09463095             0.7575850            -0.2560930
## LotLikes            0.09392458             0.7703931            -0.2648767
##          min_negative_polarity max_negative_polarity title_subjectivity
## FewLikes            -0.5210734            -0.1069238          0.2677329
## MidLikes            -0.5161442            -0.1064693          0.2746782
## LotLikes            -0.5321845            -0.1089502          0.3019028
##          title_sentiment_polarity abs_title_subjectivity
## FewLikes               0.05377237              0.3404780
## MidLikes               0.07024472              0.3421620
## LotLikes               0.08814128              0.3427182
##          abs_title_sentiment_polarity  weekday  channel
## FewLikes                    0.1439285 4.475144 4.164947
## MidLikes                    0.1511973 4.234290 4.208808
## LotLikes                    0.1711601 4.177534 4.145932

We can find several issues with this model. QDA needs roughly 10000 rows per category to work well, which is the reason I used a larger training fraction (85%) for this approach; depending on how the split falls, this piece of code can raise a rank-deficiency error caused by exactly that. Collinearity also breaks this model, so it is important to keep only a few important columns — we choose the main ones.

This is one of the reasons why reducing noise is so important. I keep many of the columns only because, in some models, a degree of overfitting can help raw predictive accuracy — although it does not help us understand what is going on.

Performance:

# Hard class labels predicted by the QDA model on its own test split.
prediction <- predict(model_qda, newdata = dataTest_qda)$class

# Full confusion-matrix report for the QDA predictions.
confusionMatrix(data = prediction, reference = dataTest_qda$shares)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FewLikes MidLikes LotLikes
##   FewLikes     1084      942      616
##   MidLikes      310      559      519
##   LotLikes      469      638      808
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4123          
##                  95% CI : (0.3997, 0.4249)
##     No Information Rate : 0.3598          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.1247          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: FewLikes Class: MidLikes Class: LotLikes
## Sensitivity                   0.5819         0.26134          0.4159
## Specificity                   0.6183         0.78219          0.7234
## Pos Pred Value                0.4103         0.40274          0.4219
## Neg Pred Value                0.7642         0.65328          0.7184
## Prevalence                    0.3134         0.35980          0.3268
## Detection Rate                0.1823         0.09403          0.1359
## Detection Prevalence          0.4444         0.23347          0.3221
## Balanced Accuracy             0.6001         0.52176          0.5696
# Raw cross-tabulation only.
confusionMatrix(data = prediction, reference = dataTest_qda$shares)$table
##           Reference
## Prediction FewLikes MidLikes LotLikes
##   FewLikes     1084      942      616
##   MidLikes      310      559      519
##   LotLikes      469      638      808
# Overall accuracy, extracted by name rather than by position.
confusionMatrix(data = prediction, reference = dataTest_qda$shares)$overall["Accuracy"]
##  Accuracy 
## 0.4122792

Accuracy is quite poor now: kappa has dropped to about 0.12 and accuracy barely exceeds 41%. No improvement with respect to LDA.

A benchmark model

What is the accuracy of a benchmark model that predicts the most frequent outcome for all observations? Benchmark models simply predict the most frequent class; it is a cheap choice, so let's try it:

# Frequency of each class in the training data; the benchmark model
# below always predicts the most frequent one.
table(dataTrain_csf$shares)
## 
## FewLikes MidLikes LotLikes 
##     9940    11412    10364
# Benchmark: always predict the single most frequent class. The majority
# class must be chosen from the TRAINING data (here MidLikes, per the
# table above) — the original code took max(table(dataTest_csf$shares)),
# which peeks at the test labels. Counting the train-majority class in
# the test set gives the honest benchmark accuracy (same number here,
# since both splits share the same majority class).
majority_class <- names(which.max(table(dataTrain_csf$shares)))
obs <- sum(dataTest_csf$shares == majority_class)
accuracy <- obs / nrow(dataTest_csf); accuracy
## [1] 0.3598638

Accuracy is around 36%, which does not look bad next to the 41% of the QDA once we also account for the computation time. Still, it is not very useful for prediction, as roughly 2 out of every 3 predictions will be wrong.

We are obtaining pretty low accuracy results, I am going to try other different models and aproaches that will provide us better results, let´s use them:

Naive Bayes

As we know, naive Bayes comes in several flavours; all of them assume the predictor variables are conditionally independent given the target class, and the Gaussian variant additionally assumes each numeric predictor is normally distributed within each class. I am going first with the Gaussian approach:

Standard version (Gaussian)

# Gaussian naive Bayes; `laplace` controls smoothing of the estimated
# probabilities. NOTE: dataTrain_csf still contains the target column
# `shares` (see summary(dataTrain_csf$shares) above), so passing the
# whole frame as predictors leaks the label into the model and inflates
# accuracy to ~99%. Exclude the target from both predictor matrices.
x_train_nb <- as.matrix(dataTrain_csf[, setdiff(names(dataTrain_csf), "shares")])
x_test_nb  <- as.matrix(dataTest_csf[, setdiff(names(dataTest_csf), "shares")])
model_nb0 <- naiveBayes(x_train_nb, y.train, laplace = 0.6)
prediction <- predict(model_nb0, x_test_nb)

We are assuming that the columns are independent, so the variance of the estimator is very low, but the bias can be huge.

Let´s see the results in the confusion Matrix:

# Full confusion-matrix report for the naive Bayes predictions.
confusionMatrix(data = prediction, reference = y.test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FewLikes MidLikes LotLikes
##   FewLikes     2478       14       19
##   MidLikes        1     2839        4
##   LotLikes        5        0     2568
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9946          
##                  95% CI : (0.9927, 0.9961)
##     No Information Rate : 0.3599          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9919          
##                                           
##  Mcnemar's Test P-Value : 3.28e-05        
## 
## Statistics by Class:
## 
##                      Class: FewLikes Class: MidLikes Class: LotLikes
## Sensitivity                   0.9976          0.9951          0.9911
## Specificity                   0.9939          0.9990          0.9991
## Pos Pred Value                0.9869          0.9982          0.9981
## Neg Pred Value                0.9989          0.9972          0.9957
## Prevalence                    0.3133          0.3599          0.3268
## Detection Rate                0.3126          0.3581          0.3239
## Detection Prevalence          0.3167          0.3587          0.3245
## Balanced Accuracy             0.9958          0.9971          0.9951
# Overall accuracy, extracted by name rather than by position.
confusionMatrix(data = prediction, reference = y.test)$overall["Accuracy"]
##  Accuracy 
## 0.9945762

Accuracy comes out at 99.5%, which is an implausibly good result, and kappa is also extremely high at 0.99. An accuracy this high makes me suspicious — results like this often indicate that the target has leaked into the predictors — so I do not fully trust this value. Just for comparison, let's look at the probabilities as an output:

# Raw posterior probabilities instead of hard class labels.
probability <- predict(model_nb0, as.matrix(dataTest_csf), type = "raw")

head(probability)
##          FewLikes     MidLikes     LotLikes
## [1,] 1.383183e-04 9.998558e-01 5.919158e-06
## [2,] 9.999964e-01 1.138425e-06 2.440958e-06
## [3,] 9.999346e-01 9.226965e-07 6.445266e-05
## [4,] 3.349156e-05 1.883209e-06 9.999646e-01
## [5,] 2.160561e-03 5.608863e-05 9.977834e-01
## [6,] 3.355196e-04 1.143374e-05 9.996530e-01
# Histogram of all posterior probabilities across the three classes.
hist(probability)

These values are quite extreme (most are close to 0 or 1), so we need to be careful with them as well.

We need to check kappa: it is the accuracy after discounting the agreement a naive classifier would achieve by chance. A useful tool must beat that chance-level baseline, and since this one reports a kappa of 0.99 it looks (suspiciously) good.

As kappa is quite high, we can expect the accuracy to be also really high.

I have also tried to compute Bernoulli naive-bayes approximation and multinomial but I have not been capable of computing them due to many different errors. Let´s use more advanced tools:

Caret

Thanks to caret, a package in R, we can try more than 200 different models in an easy way, using cross validation (cv) and multiple core processing. We will use now caret a lot, but let´s start by using it to predict with naive Bayes:

Firstly, here we define the parameters that are going to go in the control variable:

ctrl = trainControl(method = "repeatedcv", # resampling scheme; bootstrap, LOOCV and many others are possible
                     repeats = 2, # repeat the full cross-validation 2 times to stabilize the estimates
                     number = 3, # number of folds (k) per repetition
                    allowParallel = TRUE, # let caret use the registered parallel backend
                    verboseIter = T) # print progress for each resampling iteration

We must take care as caret does not have implemented the multinomial naive Bayes. So, instead of 3 categories, we are going to do just binomial, this is why we have transformed our data set into binary:

# No-resampling control object. NOTE(review): it is defined here but the
# train() call below uses `ctrl` (repeated CV) instead — confirm which
# control was intended.
trctrl = trainControl(method = "none")

tic()
# startParallel() is presumably a helper defined earlier in the document
# (it is not part of parallel/doParallel) that registers a socket cluster.
cl <- startParallel()
cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Hyper-parameter grid for caret's "naive_bayes": kernel density on/off,
# Laplace smoothing in [0, 1], and kernel bandwidth adjustment in [0.5, 1.5].
nb_grid <- expand.grid(usekernel = c(TRUE, FALSE),
                       laplace = seq(0, 1, 0.1),
                       adjust = seq(0.5, 1.5, 0.1))

# "Train" (fit/estimate/optimize — same thing) a naive Bayes model over
# the grid, picking the best hyper-parameters by repeated cross-validation.
# NOTE(review): dataTrain_csf appears to still contain the target column
# `shares` (see the earlier summary) — confirm it is excluded from x.
nb_mod <- train(x = dataTrain_csf,            # training predictors
                y = y.train,                  # target factor
                method = "naive_bayes",       # any of caret's ~200 model names can go here
                trControl = ctrl,             # resampling control defined above (e.g. do/skip CV)
                tuneGrid = nb_grid,
                num.threads = (detectCores() - 1)) # forwarded via ...; density() ignores it (see warnings below)
## Aggregating results
## Selecting tuning parameters
## Fitting laplace = 0.1, usekernel = TRUE, adjust = 1.5 on full training set
## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded

## Warning: In density.default(x, na.rm = TRUE, ...) :
##  extra argument 'num.threads' will be disregarded
# Predict class labels for the held-out test set using the trained
# naive Bayes model (nb_mod, fitted earlier in the document).
nb_pred <- predict(nb_mod,
                   newdata = dataTest_csf) # This piece of code is always the same for all the models.

# Compare the predictions against the true test labels (y.test).
confusionMatrix(nb_pred, y.test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FewLikes MidLikes LotLikes
##   FewLikes     2469        1        7
##   MidLikes        2     2816        2
##   LotLikes       13       36     2582
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9923          
##                  95% CI : (0.9901, 0.9941)
##     No Information Rate : 0.3599          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9884          
##                                           
##  Mcnemar's Test P-Value : 3.999e-07       
## 
## Statistics by Class:
## 
##                      Class: FewLikes Class: MidLikes Class: LotLikes
## Sensitivity                   0.9940          0.9870          0.9965
## Specificity                   0.9985          0.9992          0.9908
## Pos Pred Value                0.9968          0.9986          0.9814
## Neg Pred Value                0.9972          0.9928          0.9983
## Prevalence                    0.3133          0.3599          0.3268
## Detection Rate                0.3114          0.3552          0.3257
## Detection Prevalence          0.3124          0.3557          0.3319
## Balanced Accuracy             0.9962          0.9931          0.9937
confusionMatrix(nb_pred,y.test)$overall[1]
##  Accuracy 
## 0.9923058
# plot(nb_mod)

toc()  # Stop the tictoc timer started before model training.
## 90.11 sec elapsed
endParallel(cl)  # NOTE(review): presumably a helper defined earlier that wraps stopCluster(cl) and prints the message below — confirm.
## [1] "The model has finished."
plot(nb_mod)  # Accuracy across the caret tuning grid (laplace / usekernel / adjust).

plot(confusionMatrix(nb_pred,y.test)[["table"]])  # Mosaic plot of the confusion-matrix table.

Aggregating results
Selecting tuning parameters
Fitting laplace = 0.1, usekernel = TRUE, adjust = 1.5 on full training set

So, as a result, we can see that the accuracy is extremely high as before, at around 99.2%. For the first time in this project we have used the caret package with a training control, doing cross-validation over different hyperparameters. Although that search did not beat the earlier naive Bayes result, the accuracy is above 99%, so further improvement would be almost impossible.

Logistic Regression

In order to use logistic regression with glm, our categorical target must be binomial (something like yes or no), so we need to collapse the Few, Mid and Lot share classes into just two. The drawback for our data is that we will only learn whether the number of shares will be above or below our threshold (of 1400), which is less informative; despite this, we are using it:

# Build a binary version of the target: collapse the share counts into two
# classes split at the 1400-share threshold used throughout this document.
data_csf_bin <- data
data_csf_bin$shares <- factor(ifelse(data_csf_bin$shares <= 1400, "FewLikes", "LotLikes"))

# Stratified 80/20 train/test split on the binary target.
# NOTE(review): consider calling set.seed() before this for reproducibility.
splitting_csf_bin <- createDataPartition(data_csf_bin$shares, p = 0.8, list = FALSE)

dataTrain_csf_bin <- data_csf_bin[splitting_csf_bin, ]
dataTest_csf_bin <- data_csf_bin[-splitting_csf_bin, ]

Now I am creating the Logistic Regression model:

model_lgr = glm(shares ~., family = binomial(link="logit"),data=dataTrain_csf_bin); summary(model_lgr) 
## 
## Call:
## glm(formula = shares ~ ., family = binomial(link = "logit"), 
##     data = dataTrain_csf_bin)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7319  -1.0349  -0.6274   1.0700   2.1890  
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   4.592e+02  1.334e+03   0.344 0.730573    
## timedelta                     1.129e-04  8.232e-05   1.372 0.170066    
## n_tokens_title               -2.074e-03  6.048e-03  -0.343 0.731679    
## n_tokens_content              1.331e-04  4.813e-05   2.766 0.005681 ** 
## n_unique_tokens              -2.519e-01  4.018e-01  -0.627 0.530626    
## n_non_stop_words              1.421e-02  1.283e+00   0.011 0.991159    
## n_non_stop_unique_tokens     -4.438e-01  3.403e-01  -1.304 0.192211    
## num_hrefs                     7.731e-03  1.466e-03   5.275 1.33e-07 ***
## num_self_hrefs               -2.478e-02  3.815e-03  -6.494 8.35e-11 ***
## num_imgs                      2.401e-03  1.911e-03   1.256 0.209010    
## num_videos                    2.840e-03  3.336e-03   0.851 0.394548    
## average_token_length         -1.266e-01  5.040e-02  -2.511 0.012027 *  
## num_keywords                  4.206e-02  7.772e-03   5.412 6.23e-08 ***
## kw_min_min                    1.664e-03  3.451e-04   4.823 1.42e-06 ***
## kw_max_min                    3.487e-05  1.589e-05   2.194 0.028207 *  
## kw_avg_min                   -2.703e-04  1.032e-04  -2.619 0.008812 ** 
## kw_min_max                   -4.408e-07  2.409e-07  -1.830 0.067246 .  
## kw_max_max                   -3.938e-07  1.241e-07  -3.172 0.001514 ** 
## kw_avg_max                   -5.160e-07  1.782e-07  -2.896 0.003779 ** 
## kw_min_avg                   -9.007e-05  1.613e-05  -5.585 2.34e-08 ***
## kw_max_avg                   -8.599e-05  5.549e-06 -15.496  < 2e-16 ***
## kw_avg_avg                    7.127e-04  3.223e-05  22.110  < 2e-16 ***
## self_reference_min_shares    -8.963e-07  1.960e-06  -0.457 0.647513    
## self_reference_max_shares     6.542e-08  9.561e-07   0.068 0.945448    
## self_reference_avg_sharess    4.499e-06  2.552e-06   1.763 0.077942 .  
## is_weekend                    5.150e-01  5.702e-02   9.032  < 2e-16 ***
## LDA_00                       -4.603e+02  1.334e+03  -0.345 0.729960    
## LDA_01                       -4.614e+02  1.334e+03  -0.346 0.729338    
## LDA_02                       -4.616e+02  1.334e+03  -0.346 0.729226    
## LDA_03                       -4.614e+02  1.334e+03  -0.346 0.729315    
## LDA_04                       -4.609e+02  1.334e+03  -0.346 0.729616    
## global_subjectivity           9.171e-01  1.771e-01   5.179 2.23e-07 ***
## global_sentiment_polarity    -1.249e-02  3.492e-01  -0.036 0.971461    
## global_rate_positive_words   -2.295e+00  1.483e+00  -1.548 0.121716    
## global_rate_negative_words    1.536e+00  2.881e+00   0.533 0.593948    
## rate_positive_words           7.927e-01  1.255e+00   0.631 0.527783    
## rate_negative_words           5.077e-01  1.265e+00   0.401 0.688222    
## avg_positive_polarity        -3.372e-01  2.858e-01  -1.180 0.238035    
## min_positive_polarity        -3.690e-01  2.377e-01  -1.553 0.120501    
## max_positive_polarity        -1.906e-02  8.959e-02  -0.213 0.831499    
## avg_negative_polarity        -5.242e-02  2.635e-01  -0.199 0.842287    
## min_negative_polarity         3.333e-02  9.633e-02   0.346 0.729390    
## max_negative_polarity         4.258e-02  2.169e-01   0.196 0.844354    
## title_subjectivity            1.871e-01  5.712e-02   3.275 0.001057 ** 
## title_sentiment_polarity      2.244e-01  5.230e-02   4.291 1.78e-05 ***
## abs_title_subjectivity        3.463e-01  7.607e-02   4.553 5.29e-06 ***
## abs_title_sentiment_polarity -4.949e-02  8.256e-02  -0.599 0.548859    
## weekdayMonday                -1.296e-01  4.284e-02  -3.025 0.002486 ** 
## weekdaySaturday               2.611e-01  7.037e-02   3.710 0.000207 ***
## weekdaySunday                        NA         NA      NA       NA    
## weekdayThursday              -2.050e-01  4.204e-02  -4.876 1.08e-06 ***
## weekdayTuesday               -2.708e-01  4.194e-02  -6.457 1.07e-10 ***
## weekdayWednesday             -2.847e-01  4.197e-02  -6.784 1.17e-11 ***
## channelEntertainment         -3.785e-02  7.634e-02  -0.496 0.620077    
## channelLifestyle              1.209e-01  7.763e-02   1.558 0.119305    
## channelOther                  2.582e-01  8.013e-02   3.222 0.001272 ** 
## channelSocial-Media           1.065e+00  6.964e-02  15.293  < 2e-16 ***
## channelTech                   7.422e-01  6.941e-02  10.692  < 2e-16 ***
## channelWorld                  2.958e-01  7.477e-02   3.955 7.64e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 43962  on 31715  degrees of freedom
## Residual deviance: 39811  on 31658  degrees of freedom
## AIC: 39927
## 
## Number of Fisher Scoring iterations: 8

In logistic regression, as before, we can predict using probabilities:

# Predicted probabilities on the test set. With a factor response, glm with
# type = 'response' returns P(second factor level), i.e. P("LotLikes") given
# the level order created by factor() above.
probability <- predict(model_lgr, newdata = dataTest_csf_bin, type = 'response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
head(probability)
##         2         5         9        11        13        15 
## 0.2196621 0.3118688 0.3353655 0.1984133 0.2535278 0.1826584
# Tune the classification threshold over a grid of cut-offs and record the
# test accuracy for each.
prop <- seq(0.1, 1, 0.1)
accuracy <- expand.grid(prop, 0)
for (i in seq_along(prop)) {
  # type = 'response' gives P(second factor level) = P("LotLikes"); the
  # original mapped high probabilities to "FewLikes", inverting the labels
  # (which is what caused the "Levels are not in the same order" warning).
  # Corrected here so high probability predicts "LotLikes".
  prediction1 <- as.factor(ifelse(probability > prop[i], "LotLikes", "FewLikes"))
  accuracy[i, 2] <- confusionMatrix(prediction1, dataTest_csf_bin$shares)$overall[1]
}
## Warning in confusionMatrix.default(prediction1, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
accuracy_f = accuracy[which.max(accuracy[,2]),]; accuracy_f
##   Var1      Var2
## 1  0.1 0.5064329
## > accuracy_f = accuracy[which.max(accuracy[,2]),]; accuracy_f
##   Prob     Acc 
## 1  0.1 0.506559

So, up to now, we have only been able to get about 35% accuracy when predicting whether a post will receive few likes or a lot of likes — something that should be easy — but the results do not seem good at all, since kappa is also around 30%.

But, when we tune the probability threshold as a hyperparameter, we can see that for a threshold of 0.1 we get an accuracy of 50.6%, somewhat better than with the default 0.5. This is because predicting the category FewLikes whenever p is greater than 0.1 classifies most observations that way and improves accuracy; that's why choosing p equal to zero increases it even more, but such a threshold is not useful or realistic for predicting the number of shares a post will receive.

Penalized logistic regression

When the dimension is high, it is better to use a penalized version of logistic regression. The best-known implementations are glmnet and elasticnet.

# Grid search over threshold (prop), elastic-net mixing (alpha) and penalty
# strength (lambda); column 4 of `accuracy` stores the test accuracy.
prop <- seq(0.1, 1, 0.1)
alpha <- seq(0, 0.1, 0.01)
lambda <- seq(0, 0.01, 0.001)
accuracy <- expand.grid(prop, alpha, lambda, 0)

# glmnet requires a numeric matrix, not a data frame, and these matrices are
# loop-invariant, so build them once instead of on every iteration. (The
# original converted only the test set, and did so inside the loop.)
x_train <- data.matrix(dataTrain_csf_bin[, -40])
x_test <- data.matrix(dataTest_csf_bin[, -40])

for (i in seq_len(nrow(accuracy))) {
  model_logit <- glmnet(x_train, dataTrain_csf_bin$shares,
                        family = "binomial", alpha = accuracy[i, 2], lambda = accuracy[i, 3])

  probability2 <- predict(model_logit, x_test, type = 'response')

  # type = 'response' returns P(second factor level) = P("LotLikes"); mapping
  # high probabilities to "FewLikes" inverted the labels (hence the repeated
  # "Levels are not in the same order" warnings). Corrected here.
  prediction2 <- as.factor(ifelse(probability2 > accuracy[i, 1], "LotLikes", "FewLikes"))
  accuracy[i, 4] <- confusionMatrix(prediction2, dataTest_csf_bin$shares)$overall[1]
}
## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(prediction2, dataTest_csf_bin$shares): Levels
## are not in the same order for reference and data. Refactoring data to match.
# Keep the hyper-parameter row with the highest accuracy (4th column of the grid).
accuracy_f = accuracy[which.max(accuracy[,4]),]; accuracy_f
##    Var1 Var2 Var3     Var4
## 10    1    0    0 0.493441
>      Var1   Var2    Var3       Var4
> 621   0.1   0.07    0.005     0.5066852

We obtain that the best parameters are probability = 0.1, alpha = 0.07 and lambda = 0.005, with an accuracy of 50%. We get these values, as before, when we set the probability to the lowest one. In comparison with the naive Bayes approach, this method is definitely not valid for this data set. If we consider that we get 50% because of choosing the lowest probability, this affirmation becomes even stronger.

The ROC curve

# LDA with informative priors: 10% / 90%. The prior order follows the factor
# level order of `shares` — presumably (FewLikes, LotLikes); confirm with levels().
model = lda(shares ~ ., data=dataTrain_csf_bin,  prior = c(.1, .9))
## Warning in lda.default(x, grouping, ...): variables are collinear

As I want the posts with higher shares to represent the largest part of the sample, I set the category HighShares to 90%, so the threshold will be high.

I am getting the probabilities and computing the ROC Curve:

# Posterior class probabilities of the LDA model on the held-out test set.
probability = predict(model, dataTest_csf_bin)$posterior

# ROC curve built from the posterior probability of the second class
# (LotLikes, per the "Setting levels" message below).
roc.lda <- roc(dataTest_csf_bin$shares,probability[,2])
## Setting levels: control = FewLikes, case = LotLikes
## Setting direction: controls < cases
auc(roc.lda) 
## Area under the curve: 0.7
# Plot the ROC curve with the AUC printed and the optimal threshold marked.
plot.roc(dataTest_csf_bin$shares, probability[,2],col="darkblue", print.auc = TRUE,  auc.polygon=TRUE, grid=c(0.1, 0.2),
         grid.col=c("green", "red"), max.auc.polygon=TRUE,
         auc.polygon.col="lightblue", print.thres=TRUE)
## Setting levels: control = FewLikes, case = LotLikes
## Setting direction: controls < cases

We know that the closer the curve to the top left corner, the better. The optimal threshold is also provided, 2.6% for this case. The area under the curve is 0.7022, so 70% and the values for the specificity are 0.652 and 0.63 for the sensitivity.

Remember that the sensitivity is the true positive rate and the specificity is 1 minus the false positive rate. The threshold is 89%, not so good.

We are going to analyze how good it would be for us if we were a company that wants to promote a post and get as much shares as possible; ie, optimize their own publicity and know when will be better to publish our advertisements, this is why we need to add a cost.

Assume the following (company’s data):

  • Cost of true negatives is 0: the desired number of likes is achieved.

  • Cost of false negatives is 1000: This means that we think that our post is going to be seeing, but it is not and we are going to lose much money. For each bad advertisement we pay 1000 dolars.

  • Cost of false positives is 200: If our post is obtaining the desired shares, we increase a bit the money designated to advertisement so that we continue.

  • Cost of true positives is 320: (1 - gamma) * 1000 + gamma * 200 = 150 + 170 = 320, where gamma = probability an advert is well positioned, 0.85 in our case

Cost matrix:

Prediction/Reference no yes
predicted no 0 1000
predicted yes 200 320
cost.unit <- c(0, 200, 1000, 320)

Cost of Naive classifier

Unit cost for Naive classifier (no analytics knowledge)

cost = 0*.99 + 200*0 + 1000*.01 + 320*0 = 10 eur/share on average

For this dataset, as the naive bayes works really good with 99% accuracy, the cost will be so small.

Savings = (post increase incentive - cost)

in the case of the naive classifier, the savings = 190 eur/post, i.e. a big gain. As the rest of the models have demonstrated a smaller accuracy, we are not going to consider them.

So, if we make 10000 posts we could be gaining around 2.000.000 euros in investing advertising, which will conclude with lots of shares and promotion for any post that we promote / advertise.

Let´s now continue using again caret to try classical machine learning approaches applied to classification:

Rpart, Decision trees

library(caret)

tic()
# Start the parallel backend (startParallel/endParallel are helpers defined
# earlier in this file).
cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Model formula and rpart pre-pruning controls.
model = shares ~.
control = rpart.control(minsplit = 40, maxdepth = 12, cp=0.001)
# Plain rpart fit (used for the tree visualization below).
dtFit <- rpart(model, data=dataTrain_csf_bin, method = "class", control = control)


# caret fit: 10-fold cross-validation over 10 candidate cp values (tuneLength = 10).
caret.fit <- train(model, 
                   data = dataTrain_csf_bin, 
                   method = "rpart",
                   control = control,
                   trControl = trainControl(method = "cv", number = 10),
                   tuneLength=10)
caret.fit
## CART 
## 
## 31716 samples
##    48 predictor
##     2 classes: 'FewLikes', 'LotLikes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 28545, 28545, 28544, 28545, 28544, 28544, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.002257721  0.6404343  0.2805730
##   0.002428115  0.6401821  0.2801055
##   0.003546326  0.6353580  0.2713267
##   0.003642173  0.6333715  0.2672693
##   0.004664537  0.6246380  0.2500406
##   0.005015974  0.6229038  0.2467485
##   0.008051118  0.6171652  0.2350191
##   0.013546326  0.6126567  0.2247119
##   0.034760383  0.5943370  0.1885777
##   0.177827476  0.5635016  0.1233526
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.002257721.
endParallel(cl);
## [1] "The model has finished."
toc()
## 20.78 sec elapsed

Now we are predicting on the binary categorical set, and the accuracy is not bad at all: the optimal value for the complexity parameter is cp = 0.002257721; the accuracy is about 64% and kappa is just 28%. If we improve the hyper-parameters we will probably get better results.

Visualization of the tree:

library(rpart.plot)
rpart.plot(caret.fit$finalModel)

Prediction

# Class probabilities from the tuned CART model on the test set.
dtProb <- predict(caret.fit, dataTest_csf_bin, type = "prob")
# Custom decision threshold: predict "LotLikes" when P(LotLikes) > 0.4.
threshold = 0.4
dtPred = rep("FewLikes", nrow(dataTest_csf_bin))
dtPred[which(dtProb[,2] > threshold)] = "LotLikes"
summary(dtPred)
##    Length     Class      Mode 
##      7928 character character
CM = confusionMatrix(table(dtPred,  dataTest_csf_bin$shares))$table

# Average economic cost per post, using the unit costs defined above.
cost = sum(as.vector(CM)*cost.unit)/sum((CM)); cost
## [1] 296.7255

Here, thanks to the previous analysis, we compute the cost (positive, by the way) of each post that we, as a company, intend to promote. The closer the threshold is to 0.5, the higher the accuracy and the lower the cost; if we raise the probability threshold much further, the cost increases as well.

Random Forest

Now, we are gonna compute the cost but using random forests:

This function, which we have used in class, allows us to compute the cost faster, avoiding unnecessary time consumption:

# Custom caret summary function: average economic cost per observation.
#
# caret calls summary functions as f(data, lev, model), where `data` carries
# the columns `pred` (predicted class) and `obs` (observed class).
#
# cost.unit: per-cell unit costs, in column-major order of the 2x2 confusion
#   matrix (pred-no/ref-no, pred-yes/ref-no, pred-yes... i.e. TN, FP, FN, TP).
#   Defaults to c(0, 200, 1000, 320), matching the cost matrix defined above;
#   the extra argument generalizes the function without changing caret's call.
#
# Returns a named numeric scalar "EconomicCost" (cost per prediction).
EconomicCost <- function(data, lev = NULL, model = NULL,
                         cost.unit = c(0, 200, 1000, 320)) 
{
  y.pred = data$pred 
  y.true = data$obs
  CM = confusionMatrix(y.pred, y.true)$table
  # Weight each confusion-matrix cell by its unit cost, average over all cases.
  out = sum(as.vector(CM) * cost.unit) / sum(CM)
  names(out) <- c("EconomicCost")
  out
}

Now include this function in the Caret control:

tic()
# Start the parallel backend (helper defined earlier in the file).
cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# 5-fold CV scored with the custom EconomicCost summary function;
# classProbs = TRUE is required so the summary function can be applied.
ctrl <- trainControl(method = "cv", number = 5,
                     classProbs = TRUE, 
                     summaryFunction = EconomicCost,
                     verboseIter=T,
                     allowParallel = TRUE)

# Random forest tuned over mtry (variables sampled at each split),
# selecting the model that MINIMIZES economic cost (maximize = F).
rf.train <- train(shares ~., 
                  method = "rf", 
                  data = dataTrain_csf_bin,
                  preProcess = c("center", "scale"),
                  ntree = 200,
                  cutoff=c(0.7,0.3),
                  tuneGrid = expand.grid(mtry=c(8,10,15)), 
                  metric = "EconomicCost",
                  maximize = F,
                  trControl = ctrl)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 8 on full training set
endParallel(cl)
## [1] "The model has finished."
toc()
## 215.61 sec elapsed

We could further improve the accuracy of this random forest by increasing the number of cross-validation folds and by tuning mtry, the number of variables randomly sampled as candidates at each split. Together with gradient boosting, this is probably one of the most time-consuming tools. Despite this, I have tried to improve it a bit, which is feasible thanks to parallel processing, which saves much time.

Variable importance:

rf_imp <- varImp(rf.train, scale = F)
plot(rf_imp, scales = list(y = list(cex = .95)))

This part may not seem like it, but it is probably one of the most important for our company. Here we can see which variables matter most for our cost on the data set. By far, the first ones are kw_avg_avg and kw_max_avg — the average and the maximum value of shares associated with a post — and obviously these are so important because, depending on the mean number of shares on a website, our post will take more or fewer shares there.

Prediction:

rfPred = predict(rf.train, newdata=dataTest_csf_bin)
CM = confusionMatrix(table(rfPred,  dataTest_csf_bin$shares))$table
cost = sum(as.vector(CM)*cost.unit)/sum(CM)
cost
## [1] 254.4349
confusionMatrix(table(rfPred,  dataTest_csf_bin$shares))$overall[1]
##  Accuracy 
## 0.5881685

We are obtaining again a cost that is positive and that rounds 255 euros per post, what keeps being really good. Note that the accuracy is 60%, not as good as naive bayes, but pretty decent.

Gradient Boosting

# Hyper-parameter grid for xgboost: 2*2*3*1*2*2*1 = 48 candidate combinations.
xgb_grid = expand.grid(
  nrounds = c(500,1000),
  eta = c(0.01, 0.001), # c(0.01,0.05,0.1)
  max_depth = c(2, 4, 6),
  gamma = 1,
  colsample_bytree = c(0.2, 0.4),
  min_child_weight = c(1,5),
  subsample = 1
)

Then, train

tic()
cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Only 2 CV folds here, to keep xgboost's runtime manageable.
ctrl <- trainControl(method = "cv", number = 2,
                     classProbs = TRUE, 
                     summaryFunction = EconomicCost,
                     verboseIter=T)

# xgboost tuned over xgb_grid, minimizing the economic-cost metric.
xgb.train = train(shares ~ .,  data=dataTrain_csf_bin,
                  trControl = ctrl,
                  metric="EconomicCost",
                  maximize = F,
                  tuneGrid = xgb_grid,
                  preProcess = c("center", "scale"),
                  method = "xgbTree"
)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 1000, max_depth = 6, eta = 0.01, gamma = 1, colsample_bytree = 0.2, min_child_weight = 5, subsample = 1 on full training set
endParallel(cl)
## [1] "The model has finished."
toc()
## 195.15 sec elapsed
## 195.15 sec elapsed

With gradient boosting, the most time-consuming model we have seen so far, and with the few parameters that we have tried, I obtained the following fit values: nrounds = 1000, max_depth = 6, eta = 0.01, gamma = 1, colsample_bytree = 0.2, min_child_weight = 5, subsample = 1 on the full training set.

Variable importance:

xgb_imp <- varImp(xgb.train, scale = F)
plot(xgb_imp, scales = list(y = list(cex = .95)))

The most important variables remain more or less the same ones. Prediction and cost:

# Lower decision threshold (0.2): biases predictions towards "LotLikes",
# trading raw accuracy for a lower expected economic cost.
threshold = 0.2
xgbProb = predict(xgb.train, newdata=dataTest_csf_bin, type="prob")
xgbPred = rep("FewLikes", nrow(dataTest_csf_bin))
xgbPred[which(xgbProb[,2] > threshold)] = "LotLikes"
CM = confusionMatrix(table(xgbPred,  dataTest_csf_bin$shares))$table

# Average economic cost per post and overall accuracy.
cost = sum(as.vector(CM)*cost.unit)/sum(CM); cost
## [1] 253.9102
confusionMatrix(table(xgbPred,  dataTest_csf_bin$shares))$overall[1]
##  Accuracy 
## 0.5417508

As in the random forest, we get about 254 euros/post in cost and 54% accuracy. I keep insisting that, with more resources, upgrading the tuning grid would provide us more and more accuracy, probably up to a 20% increase.

Ensemble learning: combine the best classifiers and form a meta-classifier

Now we are going to compute a really intelligent movement that is: instead of keeping a model that performs better, let´s try to compute or keep the 3 best ones and get their average. In practice thanks to this easy step, we get a global better accuracy because thanks to mixing, the model fits better in new data, in future data and we can get higher performances.

# Majority-vote helper: returns the most frequent value in `v`.
# Ties are broken in favour of the value appearing first in `v`
# (which.max keeps the earliest maximum).
# NOTE(review): this shadows base::mode() within the script — intentional here.
mode <- function(v) {
  distinct_vals <- unique(v)
  freq <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(freq)]
}

Prediction

# Ensemble: per-row majority vote over the three classifiers' predictions.
ensemble.pred = apply(data.frame(dtPred, xgbPred, rfPred), 1, mode) 
CM1 = confusionMatrix(factor(ensemble.pred), dataTest_csf_bin$shares)$table
# Bug fix: the numerator previously reused `CM` (the xgboost confusion matrix
# from the section above); both numerator and denominator must use the
# ensemble's own confusion matrix CM1. The printed value below may differ.
cost = sum(as.vector(CM1)*cost.unit)/sum(CM1); cost
## [1] 253.9102

We can see that the output is about 254 euros/post, obtained by taking the majority vote (mode) of the three models' predictions.

Classification Final Conclusions

It has been a curious thing to compute classification approach with this data set. It may seem in some cases not too relevant because this data originally was made for regression, but I think that we have been capable of getting really convenient insights and at the end I realized that the data was also cool for this type of supervised learning.

We know for which channels or days or the weeks, the number of shares in our posts will be higher, we know what variables are most important depending on the model approach that we use and a really cool thing is that since an economic point of view, we have been capable of guessing how will work this data if we were a company that wants to public something in social media or any other channel, advertise it and earn money with it.

Naive Bayes has been the best model with many difference but the fact that accuracy and kappa are that high does not seem to be a really good think, maybe over fitting is related to that. I have decided to use this many variables, around 40, because many models work well with it, but probably if we spend more time removing a few and configuring well the hyper parameters, our results would be better.

Regression Approach

splitting = createDataPartition(data$shares, p = 0.9, list = FALSE)  # 90% for model building, 10% held out for final testing

dataModels = data[splitting,]
dataFinalTesting = data[-splitting,] #Only for regression, as variables are not transformed. 

# Within the modelling set: 70% train / 30% test.
splitting_2 = createDataPartition(dataModels$shares, p = 0.7, list = FALSE)

dataTrain = dataModels[splitting_2,]
dataTest = dataModels[-splitting_2,]

Now, let´s apply some visualization tools, for the numeric data, we are going to see similar results but applied with numbers:

ggplot(data=dataTrain) + aes(x = shares) + geom_histogram(fill = "darkgreen") + ggtitle("Total shares")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We can see that there are many asymmetric variables and we can just see very extreme values. Most of the posts receive about 1400 shares, and they oscillate between 1000 and 3000, we can see that there are also some outliers, let´s see it on a boxplot:

Outliers

summary(dataTrain$shares)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       5     946    1400    3372    2800  843300
ggplot(data=dataTrain) + aes(y = shares) + geom_boxplot(fill = "darkgreen") + ggtitle("Total shares")

ggplot(data=dataTrain) + aes(y = shares) + ylim(0,10000) + geom_boxplot(fill = "darkgreen") + ggtitle("Total shares")
## Warning: Removed 1359 rows containing non-finite values (stat_boxplot).

dataTrain<-dataTrain[!(dataTrain$shares >= 3500),] #removing rows with shares greater than 5000

ggplot(data=dataTrain) + aes(y = shares) + geom_boxplot(fill = "darkgreen") + ggtitle("Total shares")

We have needed to implement the ylim in order to see something because there are some post with almost a million shares, that in comparison with the average and the median is a clear outlier. I am going to remove all the posts with more than 3500 shares by the moment, as many of them keep being outliers but are more or less close to the third quartile.

Let´s see a density plot now that corroborates the previous information:

ggplot(data=dataTrain) + aes(x = shares) + geom_density(fill = "darkgreen") + ggtitle("Total shares")

ggplot(data=dataTrain) + aes(x = log(shares + 10^4)) + geom_density(fill = "darkgreen") + ggtitle("Total shares distribution")

I have applied also the logarithm in order to see it more linearly, also adding some shares to the x axis, so we can see better the end of the curve.

Let´s analyze how the day of the week influences on the number of shares

ggplot(dataTrain, aes(fill = weekday, y = shares)) + geom_boxplot() 

p = ggplot(dataTrain)+aes(x=shares, fill = weekday) + geom_density(alpha = 0.5)  + ggtitle("Shares depending on the day of the week"); ggplotly(p)

These two plots are really useful: by clicking on each day of the week, we can see the number of shares on each day and, as expected, on the weekend the number of shares is higher. This is due to the extra activity during free-time days, when people are staying with their families or enjoying their free time, which includes checking any posts or comments on social media.

Now let´s focus on the kind of articles that receive more likes/shares:

ggplot(dataTrain, aes(fill = channel, y = shares)) + geom_boxplot() 

p = ggplot(dataTrain)+aes(x=shares, fill = channel)+geom_density(alpha = 0.5) + ggtitle("Shares depending on the channel"); ggplotly(p)

The World theme, followed by Entertainment, are the least shared channels, while Social-Media and Others (subcategories such as Lifestyle) are the most shared ones — again no surprise. This means that the most shared posts are the ones people share in their profiles, rather than the ones related to news, the real world and people's entertainment, which have a higher density but a rather lower number of shares.

Let´s see if can relate or see something in relation with numerical variables:

p = ggplot(dataTrain) + aes(x = average_token_length, y = self_reference_avg_sharess, color = channel)+
  geom_point(size = 2); ggplotly(p)
p = ggplot(dataTrain) + aes(x = average_token_length, y = self_reference_avg_sharess, color = weekday)+
  geom_point(size = 2); ggplotly(p)

As before, we can see that if we click and separate in the plot depending on weekday or channel, social media and other, have the highest average number of shares. There is not a huge difference, but if we check carefully it can be appreciated.

First, let´s check which are the variables that present a higher correlation with the target variable, “shares”:

dataTrain$channel = as.numeric(as.factor(dataTrain$channel))
dataTrain$weekday = as.numeric(as.factor(dataTrain$weekday))

dataTest$channel = as.numeric(as.factor(dataTest$channel))
dataTest$weekday = as.numeric(as.factor(dataTest$weekday))

corr_delay = sort(cor(dataTrain[,c(1:47)])["shares",], decreasing = T) # I am skipping the last 3 categorical values. 

corr = data.frame(corr_delay)

p = ggplot(corr,aes(x = row.names(corr), y = corr_delay)) + geom_bar(stat = "identity", fill = "darkgreen") + 
  scale_x_discrete(limits= row.names(corr)) +  labs(x = "", y = "Shares", title = "Correlations between the variables and Shares obtained") +
  theme(plot.title = element_text(hjust = 0), axis.text.x = element_text(angle = 45, hjust = 1)); ggplotly(p)

We are including also the variable shares, whose correlation to itself is obviously 1, so that if we don´t put it and we scale, visually we are going to think that kw_avg_avg variable has a really high correlation when it is just around 0.08. From here, we can also see that the variable LDA_02 is negatively correlated to Shares in a High value.

“kw_avg_avg” and “is_weekend” seem to be the most correlated variables.

Nevertheless, let´s make some zoom in and check better the correlations:

corr_shares <- sort(cor(dataTrain[,c(21, 29, 24, 22, 23,20, 40 ,28, 48, 49, 47)])["shares",], decreasing = T)

corr=data.frame(corr_shares)

p=ggplot(corr,aes(x = row.names(corr), y = corr_shares)) + 
  geom_bar(stat = "identity", fill = "darkgreen") + 
  scale_x_discrete(limits= row.names(corr)) +
  labs(x = "Predictors", y = "Shares", title = "Correlations of the Variables with Shares") + 
  theme(plot.title = element_text(hjust = 0, size = rel(1.5)),
        axis.text.x = element_text(angle = 45, hjust = 1)); ggplotly(p)

We can see that, in this data set, the correlation between variables is quite high — we saw it at the beginning in the correlation matrix, so this is not a surprise. There will be almost no collinearity, but that does not mean we have no noise: having as many variables as we do can end up in overfitting, although for some models that is acceptable (the ones where we only care about predicting with a good R^2).

First of all, let´s compute the easiest model we could, simple regression:

linFit <- lm((shares) ~ kw_avg_avg, data=dataTrain); summary(linFit)
## 
## Call:
## lm(formula = (shares) ~ kw_avg_avg, data = dataTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3165.7  -520.7  -188.3   383.2  2120.3 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.173e+03  1.357e+01   86.40   <2e-16 ***
## kw_avg_avg  7.598e-02  4.190e-03   18.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 708.2 on 20066 degrees of freedom
## Multiple R-squared:  0.01612,    Adjusted R-squared:  0.01607 
## F-statistic: 328.8 on 1 and 20066 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(linFit, pch=23 ,bg='orange',cex=2)

kw_avg_avg: Are the keyword average shares.

We can see that R2 is 2%, what is really really bad. This is as expected, because as we use a simple linear model, the R2, that is the percentage of variability explained, that allow us to measure the accuracy or quality of a model, is going to be the even lower than the correlation between both variables. We need to include more parameters to estimate our results.

We can say that the relation between shares and kw_avg_avg (average shares on keyword) is non-linear, so we need to use a non-linear tool, because we can see that for linear ones, the performance is bad. Insights?

As in classification, let´s apply regression but introducing the logarithm:

linFit <- lm(log(shares) ~kw_avg_avg, data=dataTrain); summary(linFit) 
## 
## Call:
## lm(formula = log(shares) ~ kw_avg_avg, data = dataTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4527 -0.3390 -0.0149  0.3703  1.1313 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.940e+00  1.009e-02  687.82   <2e-16 ***
## kw_avg_avg  5.822e-05  3.115e-06   18.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5264 on 20066 degrees of freedom
## Multiple R-squared:  0.01712,    Adjusted R-squared:  0.01707 
## F-statistic: 349.5 on 1 and 20066 DF,  p-value: < 2.2e-16

We can see that R2 keeps being the same one, this means that in order to predict the number of shares in the media, as they depend on many variables, we will need to use really advance tools.

Multiple regression:

linFit1 <- lm(log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_02 + LDA_03*self_reference_min_shares, data=dataTrain) ; summary(linFit1)
## 
## Call:
## lm(formula = log(shares) ~ kw_avg_avg + self_reference_avg_sharess + 
##     (self_reference_max_shares) + LDA_02 + LDA_03 * self_reference_min_shares, 
##     data = dataTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5150 -0.3328 -0.0134  0.3614  1.2431 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       7.032e+00  1.159e-02 606.735   <2e-16 ***
## kw_avg_avg                        5.726e-05  3.523e-06  16.255   <2e-16 ***
## self_reference_avg_sharess       -1.039e-06  8.633e-07  -1.204   0.2287    
## self_reference_max_shares         6.888e-07  3.466e-07   1.987   0.0469 *  
## LDA_02                           -2.508e-01  1.341e-02 -18.708   <2e-16 ***
## LDA_03                           -1.650e-01  1.523e-02 -10.831   <2e-16 ***
## self_reference_min_shares         1.175e-06  6.211e-07   1.892   0.0586 .  
## LDA_03:self_reference_min_shares -7.287e-07  1.060e-06  -0.687   0.4918    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.521 on 20060 degrees of freedom
## Multiple R-squared:  0.03732,    Adjusted R-squared:  0.03699 
## F-statistic: 111.1 on 7 and 20060 DF,  p-value: < 2.2e-16
linFit2 <- lm(log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_00 +
                LDA_01 + LDA_02 +timedelta + n_unique_tokens + num_hrefs + channel + weekday + num_videos + abs_title_sentiment_polarity + title_sentiment_polarity + title_subjectivity +  n_tokens_title +LDA_04 + LDA_03*self_reference_min_shares, data=dataTrain) ; summary(linFit2)
## 
## Call:
## lm(formula = log(shares) ~ kw_avg_avg + self_reference_avg_sharess + 
##     (self_reference_max_shares) + LDA_00 + LDA_01 + LDA_02 + 
##     timedelta + n_unique_tokens + num_hrefs + channel + weekday + 
##     num_videos + abs_title_sentiment_polarity + title_sentiment_polarity + 
##     title_subjectivity + n_tokens_title + LDA_04 + LDA_03 * self_reference_min_shares, 
##     data = dataTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3535 -0.3198 -0.0110  0.3493  1.3166 
## 
## Coefficients: (1 not defined because of singularities)
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       6.900e+00  3.536e-02 195.133  < 2e-16 ***
## kw_avg_avg                        5.514e-05  3.547e-06  15.548  < 2e-16 ***
## self_reference_avg_sharess        5.070e-07  8.510e-07   0.596 0.551351    
## self_reference_max_shares        -5.912e-08  3.425e-07  -0.173 0.862964    
## LDA_00                            3.031e-01  1.959e-02  15.472  < 2e-16 ***
## LDA_01                           -4.001e-02  2.108e-02  -1.898 0.057689 .  
## LDA_02                           -2.354e-01  2.176e-02 -10.822  < 2e-16 ***
## timedelta                         3.542e-05  1.814e-05   1.953 0.050818 .  
## n_unique_tokens                  -2.148e-01  2.808e-02  -7.650  2.1e-14 ***
## num_hrefs                         3.752e-03  3.499e-04  10.725  < 2e-16 ***
## channel                           3.182e-02  2.678e-03  11.884  < 2e-16 ***
## weekday                          -1.466e-02  1.680e-03  -8.727  < 2e-16 ***
## num_videos                        3.766e-04  9.518e-04   0.396 0.692324    
## abs_title_sentiment_polarity     -1.865e-02  2.525e-02  -0.739 0.460060    
## title_sentiment_polarity          5.673e-02  1.539e-02   3.685 0.000229 ***
## title_subjectivity                4.505e-02  1.640e-02   2.747 0.006028 ** 
## n_tokens_title                   -1.266e-03  1.785e-03  -0.709 0.478364    
## LDA_04                            1.846e-01  1.892e-02   9.759  < 2e-16 ***
## LDA_03                                   NA         NA      NA       NA    
## self_reference_min_shares         3.296e-07  6.101e-07   0.540 0.589055    
## LDA_03:self_reference_min_shares -6.107e-07  1.042e-06  -0.586 0.557927    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5103 on 20048 degrees of freedom
## Multiple R-squared:  0.07706,    Adjusted R-squared:  0.07618 
## F-statistic: 88.09 on 19 and 20048 DF,  p-value: < 2.2e-16
#With the star symbol, R is considering 3 columns, and using them. 

# Back-transform predictions to the original share scale
# (linFit2 was fit on log(shares)).
pr.multiple = exp(predict(linFit2, newdata=dataTest))
## Warning in predict.lm(linFit2, newdata = dataTest): prediction from a rank-
## deficient fit may be misleading
# Out-of-sample R^2: squared correlation between observed and predicted shares.
cor(dataTest$shares, pr.multiple)^2
## [1] 0.008220323

The training R² is now about 7.7%, which means we have roughly tripled the variability explained. The problem is that it remains very low even though we have included many variables: we need to increase the model's complexity.

I predicted just to consolidate the results, but this accuracy would be useless in any company.

Let´s try different models:

# Earlier candidate formulas, kept for reference:
#model = log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_02 + LDA_03*self_reference_min_shares +  I(LDA_03^2) + I(self_reference_min_shares^2)

#model = (log(shares) ~ kw_avg_avg + self_reference_avg_sharess + (self_reference_max_shares) + LDA_00 +
#                 LDA_01 + LDA_02 +timedelta + n_unique_tokens + num_hrefs + channel + weekday + num_videos + abs_title_sentiment_polarity + I(title_sentiment_polarity^2) + title_subjectivity +  n_tokens_title +LDA_04 + LDA_03*self_reference_min_shares + I(LDA_03^2) + I(self_reference_min_shares^2))
# 
# 
# model = (shares~  . - abs_title_sentiment_polarity)

# Final model: log response, first-order predictors, an
# LDA_03 x self_reference_min_shares interaction, and squared terms for the
# numeric predictors to capture curvature.
# NOTE(review): the original formula listed LDA_04 twice; duplicated terms are
# dropped by terms(), so keeping it once does not change the fitted model.
model <- log(shares) ~ kw_avg_avg + self_reference_avg_sharess +
  self_reference_max_shares + LDA_00 + LDA_01 + LDA_02 + LDA_04 +
  timedelta + n_unique_tokens + num_hrefs + channel + weekday + num_videos +
  abs_title_sentiment_polarity + I(title_sentiment_polarity^2) +
  title_subjectivity + n_tokens_title +
  LDA_03 * self_reference_min_shares + I(LDA_03^2) +
  I(self_reference_min_shares^2) + I(kw_avg_avg^2) +
  I(self_reference_avg_sharess^2) + I(LDA_00^2) + I(LDA_01^2) + I(LDA_02^2) +
  I(LDA_04^2) + I(timedelta^2) + I(n_unique_tokens^2) + I(num_hrefs^2) +
  I(channel^2) + I(weekday^2) + I(num_videos^2) +
  I(abs_title_sentiment_polarity^2)

#This last one is the model that has given me a higher performance


linFit <- lm(model, data=dataTrain); summary(linFit)
## 
## Call:
## lm(formula = model, data = dataTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4323 -0.3122 -0.0119  0.3408  1.4117 
## 
## Coefficients: (2 not defined because of singularities)
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        6.484e+00  7.323e-02  88.546  < 2e-16 ***
## kw_avg_avg                         1.082e-04  6.342e-06  17.062  < 2e-16 ***
## self_reference_avg_sharess         4.332e-06  1.285e-06   3.372 0.000746 ***
## self_reference_max_shares         -4.522e-07  3.618e-07  -1.250 0.211419    
## LDA_00                             3.164e-01  8.509e-02   3.719 0.000201 ***
## LDA_01                            -1.502e-01  9.353e-02  -1.606 0.108353    
## LDA_02                             3.061e-02  8.347e-02   0.367 0.713801    
## LDA_04                             1.389e-01  8.052e-02   1.726 0.084451 .  
## timedelta                         -3.862e-04  7.082e-05  -5.453 5.01e-08 ***
## n_unique_tokens                    1.623e-01  9.453e-02   1.717 0.085912 .  
## num_hrefs                          3.221e-03  6.574e-04   4.899 9.71e-07 ***
## channel                            1.484e-01  1.309e-02  11.337  < 2e-16 ***
## weekday                            5.001e-02  8.752e-03   5.714 1.12e-08 ***
## num_videos                         3.145e-03  1.687e-03   1.864 0.062396 .  
## abs_title_sentiment_polarity      -8.432e-02  5.398e-02  -1.562 0.118310    
## I(title_sentiment_polarity^2)      1.123e-01  6.043e-02   1.858 0.063235 .  
## title_subjectivity                 5.048e-02  1.771e-02   2.851 0.004368 ** 
## n_tokens_title                     5.509e-04  1.781e-03   0.309 0.757154    
## LDA_03                                    NA         NA      NA       NA    
## self_reference_min_shares          5.177e-07  1.153e-06   0.449 0.653335    
## I(LDA_03^2)                       -8.733e-02  6.871e-02  -1.271 0.203726    
## I(self_reference_min_shares^2)     2.595e-12  2.528e-12   1.026 0.304823    
## I(kw_avg_avg^2)                   -4.231e-09  3.802e-10 -11.128  < 2e-16 ***
## I(self_reference_avg_sharess^2)   -8.904e-12  2.398e-12  -3.713 0.000206 ***
## I(LDA_00^2)                        5.731e-02  7.059e-02   0.812 0.416887    
## I(LDA_01^2)                        1.047e-01  7.697e-02   1.361 0.173578    
## I(LDA_02^2)                       -1.733e-01  6.454e-02  -2.685 0.007258 ** 
## I(LDA_04^2)                        5.080e-02  6.373e-02   0.797 0.425330    
## I(timedelta^2)                     6.114e-07  9.409e-08   6.497 8.37e-11 ***
## I(n_unique_tokens^2)              -4.428e-01  1.046e-01  -4.234 2.31e-05 ***
## I(num_hrefs^2)                    -7.851e-06  8.136e-06  -0.965 0.334549    
## I(channel^2)                      -1.526e-02  1.661e-03  -9.190  < 2e-16 ***
## I(weekday^2)                      -8.112e-03  1.074e-03  -7.556 4.32e-14 ***
## I(num_videos^2)                   -9.283e-05  4.319e-05  -2.149 0.031616 *  
## I(abs_title_sentiment_polarity^2)         NA         NA      NA       NA    
## LDA_03:self_reference_min_shares  -3.735e-06  1.129e-06  -3.307 0.000944 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5052 on 20034 degrees of freedom
## Multiple R-squared:  0.09619,    Adjusted R-squared:  0.0947 
## F-statistic: 64.61 on 33 and 20034 DF,  p-value: < 2.2e-16

We have squared the non-linear variables to check for improvements. As we are introducing tons of variables, we are going to have many betas and the noise will probably be quite high — we will basically be predicting noise — but this is actually the best way to get a greater R² in this set. Seems reasonable: R² in testing is similar to that in training.

A benchmark

The benchmark is our reference point, and for us it is going to be the prediction using the mean number of shares. Our models should therefore demonstrate accuracy higher than this one:

# Benchmark: an intercept-only model, i.e. always predicting the mean shares.
mean(dataTrain$shares)
## [1] 1401.567
benchFit <- lm(shares ~ 1, data=dataTrain)
predictions <- predict(benchFit, newdata=dataTest)
# R^2 is NA here: the constant prediction has zero variance, so cor() fails.
cor(dataTest$shares, predictions)^2
## Warning in cor(dataTest$shares, predictions): the standard deviation is zero
## [1] NA
# RMSE of the mean-only benchmark; every real model should beat this.
RMSE = sqrt(mean((predictions - dataTest$shares)^2))
RMSE
## [1] 8966.834

We obtain a root mean square error (RMSE) of about 8967, so all models should be better than this one; otherwise they will not be useful.

#Repeated Cross Validation with Caret

We are using now caret to try better outputs using cv:

# Resampling scheme shared by all caret models: 5-fold CV repeated 5 times,
# with parallel resampling enabled.
ctrl <- trainControl(method = "repeatedcv", 
                     number = 5, repeats = 5, allowParallel = TRUE)

# Data frame collecting test-set predictions of every model; the response is
# kept on the log scale to match the model formula.
test_results <- data.frame(shares = log(dataTest$shares))
# startParallel()/endParallel() are helpers defined earlier in the document
# (not shown here) — presumably wrapping makeCluster()/registerDoParallel();
# confirm against the earlier chunks.
cl = startParallel();cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Plain linear regression via caret, with predictors scaled and centered.
lm_tune <- train(model, data = dataTrain, 
                 method = "lm", 
                 preProc=c('scale', 'center'),
                 trControl = ctrl); lm_tune
## Linear Regression 
## 
## 20068 samples
##    19 predictor
## 
## Pre-processing: scaled (35), centered (35) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 16055, 16054, 16054, 16056, 16053, 16055, ... 
## Resampling results:
## 
##   RMSE       Rsquared    MAE      
##   0.5059477  0.09217688  0.3904224
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
endParallel(cl)
## [1] "The model has finished."

For the moment I am using CV, but as there is no hyperparameter yet we do not really exploit it. Despite this, we have increased the R² to roughly 9% — not bad at all.

Overfitted linear regression

As we have many variables, this model should demonstrate some better performance:

# Time the fit (tictoc) and run it on the parallel cluster.
tic()
cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Same quadratic formula, fitted again with caret's "lm" method; kept as a
# separate object so its test predictions can be stored under "alm".
alm_tune <- train(model, data = dataTrain, 
                  method = "lm", 
                  preProc=c('scale', 'center'),
                  trControl = ctrl)

endParallel(cl)
## [1] "The model has finished."
toc()
## 14.36 sec elapsed
# Out-of-sample performance on the log-scale test response.
test_results$alm <- predict(alm_tune, dataTest)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
postResample(pred = test_results$alm,  obs = test_results$shares)
##       RMSE   Rsquared        MAE 
## 0.97431799 0.07678933 0.66412598

The best model so far had about 8% R² in the testing set, and now we are getting a similar figure even though the model is heavily overfitted. This model will only work for this dataset, as in most such cases. Overfitting is dangerous because all the betas are very noisy, but purely for computing predictions it is not necessarily harmful — the drawback is that you cannot understand why it works.

Visualization

# Observed vs predicted (log shares) for the caret linear model; the green
# identity line marks perfect predictions.
ggplot(test_results, aes(x = alm, y = shares)) +
  geom_point() +
  labs(title="Linear Regression Observed VS Predicted", x ="Predicted", y="Observed") +
  geom_abline(intercept = 0, slope = 1, colour = "green") +
  theme_bw()

We can see that as we knew, our model is definitely not linear. The variability is smaller and points are closer to the line. You can use this graph in any data set in the world.

Forward regression

tic()
cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Forward stepwise selection (leaps backend); nvmax is the tuned
# hyperparameter: the maximum number of predictors allowed in the model.
for_tune = train(model, data = dataTrain, 
                  method = "leapForward", 
                  preProc=c('scale', 'center'),
                  tuneGrid = expand.grid(nvmax = 2:30),
                  trControl = ctrl)
## Warning in leaps.setup(x, y, wt = weights, nbest = nbest, nvmax = nvmax, : 2
## linear dependencies found
## Reordering variables and trying again:
endParallel(cl)
## [1] "The model has finished."
toc()
## 14.92 sec elapsed
# Print the resampling results for each nvmax value.
for_tune
## Linear Regression with Forward Selection 
## 
## 20068 samples
##    19 predictor
## 
## Pre-processing: scaled (35), centered (35) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 16055, 16055, 16053, 16055, 16054, 16056, ... 
## Resampling results across tuning parameters:
## 
##   nvmax  RMSE       Rsquared    MAE      
##    2     0.5262967  0.01733318  0.4112015
##    3     0.5215030  0.03559206  0.4068019
##    4     0.5207392  0.03839544  0.4060967
##    5     0.5200965  0.04077445  0.4054602
##    6     0.5188108  0.04543154  0.4041540
##    7     0.5185800  0.04627704  0.4039840
##    8     0.5182917  0.04732302  0.4037631
##    9     0.5173418  0.05075231  0.4025720
##   10     0.5169039  0.05234667  0.4020954
##   11     0.5158906  0.05607740  0.4010781
##   12     0.5154417  0.05767342  0.4005640
##   13     0.5149989  0.05929044  0.4001299
##   14     0.5144764  0.06121718  0.3992399
##   15     0.5140504  0.06282255  0.3989035
##   16     0.5124246  0.06878128  0.3972000
##   17     0.5115730  0.07186506  0.3963575
##   18     0.5101998  0.07680829  0.3952209
##   19     0.5093830  0.07970564  0.3944875
##   20     0.5090916  0.08074721  0.3942032
##   21     0.5085155  0.08280889  0.3936421
##   22     0.5080750  0.08441037  0.3931503
##   23     0.5078089  0.08535071  0.3928102
##   24     0.5076237  0.08601358  0.3925764
##   25     0.5075106  0.08642649  0.3924365
##   26     0.5073140  0.08714035  0.3922425
##   27     0.5070976  0.08790967  0.3919777
##   28     0.5069159  0.08855719  0.3917537
##   29     0.5067626  0.08913374  0.3915465
##   30     0.5062447  0.09096086  0.3909802
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was nvmax = 30.
plot(for_tune)

Consider that nvmax is the max number of variables you need to take, this is the hyper parameter, and we can see that in our case, the larger it is, the better.

#R concludes that the optimal nvmax is 30. In the plot we can see what happens with other values by comparing the error. This is expensive because it uses CV: each time one block of the training set is held out and the model is evaluated on it. The RMSE remains high, but we almost reach 10% of variability explained.

Now we are going to focus on knowing and understanding which Betas are better, this is part of the estimators interpretation:

coef(for_tune$finalModel, for_tune$bestTune$nvmax)
##                       (Intercept)                        kw_avg_avg 
##                       7.114920882                       0.118728510 
##        self_reference_avg_sharess         self_reference_max_shares 
##                       0.079700798                      -0.014264076 
##                            LDA_00                            LDA_01 
##                       0.106874679                      -0.016789880 
##                            LDA_02                         timedelta 
##                       0.039128737                       0.012109364 
##                   n_unique_tokens                         num_hrefs 
##                       0.020439316                       0.030841639 
##                           channel                           weekday 
##                       0.333550177                       0.107389863 
##                        num_videos      abs_title_sentiment_polarity 
##                       0.010083346                      -0.018761285 
##     I(title_sentiment_polarity^2)                title_subjectivity 
##                       0.018295695                       0.016243771 
##         self_reference_min_shares    I(self_reference_min_shares^2) 
##                       0.015154254                       0.014217734 
##                   I(kw_avg_avg^2)   I(self_reference_avg_sharess^2) 
##                      -0.068449280                      -0.077485594 
##                       I(LDA_00^2)                       I(LDA_01^2) 
##                       0.001041659                       0.013611182 
##                       I(LDA_02^2)                       I(LDA_04^2) 
##                      -0.061533372                       0.063818694 
##              I(n_unique_tokens^2)                    I(num_hrefs^2) 
##                      -0.053994478                      -0.004032317 
##                      I(channel^2)                      I(weekday^2) 
##                      -0.280251579                      -0.141886490 
##                   I(num_videos^2)  LDA_03:self_reference_min_shares 
##                      -0.012366897                      -0.017832753 
## I(abs_title_sentiment_polarity^2) 
##                       0.000000000

Betas for 30 variables

Predict

# Test-set predictions and performance for the forward-selection model.
test_results$frw <- predict(for_tune, dataTest)
postResample(pred = test_results$frw,  obs = test_results$shares)
##       RMSE   Rsquared        MAE 
## 0.97423601 0.07725199 0.66432016
# Observed vs predicted for forward selection, with the identity reference line.
ggplot(test_results, aes(x = frw, y = shares)) +
  geom_point() +
  labs(title="Forward Regression Observed VS Predicted", x="Predicted", y="Observed") +
  geom_abline(intercept = 0, slope = 1, colour = "green") +
  theme_bw()

As our estimation remains so weak, most points are not close to this line; we will have more success when we implement fully non-linear machine-learning models.

Backward regression

Train

tic()
cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Backward stepwise selection: start from the full model and drop predictors;
# nvmax again controls the maximum model size.
back_tune <- train(model, data = dataTrain, 
                   method = "leapBackward", 
                   preProc=c('scale', 'center'),
                   tuneGrid = expand.grid(nvmax = 2:31),
                   trControl = ctrl)
## Warning in leaps.setup(x, y, wt = weights, nbest = nbest, nvmax = nvmax, : 2
## linear dependencies found
## Reordering variables and trying again:
back_tune
## Linear Regression with Backwards Selection 
## 
## 20068 samples
##    19 predictor
## 
## Pre-processing: scaled (35), centered (35) 
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 16055, 16054, 16054, 16055, 16054, 16055, ... 
## Resampling results across tuning parameters:
## 
##   nvmax  RMSE       Rsquared    MAE      
##    2     0.5275290  0.01292352  0.4130581
##    3     0.5244697  0.02446124  0.4098899
##    4     0.5212534  0.03615477  0.4060028
##    5     0.5156931  0.05677509  0.4005480
##    6     0.5154281  0.05775769  0.4002703
##    7     0.5147954  0.06003049  0.3996366
##    8     0.5144708  0.06123314  0.3993029
##    9     0.5138587  0.06349778  0.3987217
##   10     0.5136689  0.06420252  0.3984384
##   11     0.5132671  0.06570189  0.3981862
##   12     0.5129813  0.06672869  0.3978710
##   13     0.5124145  0.06884447  0.3972017
##   14     0.5122554  0.06943411  0.3968911
##   15     0.5106277  0.07530200  0.3954261
##   16     0.5098207  0.07821403  0.3948219
##   17     0.5096750  0.07876082  0.3946580
##   18     0.5094384  0.07956856  0.3945113
##   19     0.5093167  0.08002506  0.3943264
##   20     0.5091867  0.08046566  0.3942253
##   21     0.5087228  0.08208804  0.3937744
##   22     0.5086971  0.08220189  0.3937359
##   23     0.5083845  0.08328864  0.3933955
##   24     0.5081981  0.08396323  0.3932225
##   25     0.5080261  0.08456725  0.3930284
##   26     0.5078230  0.08530435  0.3928369
##   27     0.5076350  0.08598905  0.3926209
##   28     0.5071824  0.08767480  0.3920996
##   29     0.5068364  0.08892162  0.3916231
##   30     0.5065445  0.08995907  0.3912851
##   31     0.5060923  0.09162725  0.3906994
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was nvmax = 31.
# RMSE versus nvmax for the backward search, then release the cluster.
plot(back_tune)

endParallel(cl); toc()
## [1] "The model has finished."
## 12.09 sec elapsed

As before, now we compute which are the chosen variables, this time we are going backwards, but the results are the same ones.

# Coefficients of the final backward-selection model at the chosen nvmax.
coef(back_tune$finalModel, back_tune$bestTune$nvmax)
##                       (Intercept)                        kw_avg_avg 
##                       7.114920882                       0.128585647 
##        self_reference_avg_sharess         self_reference_max_shares 
##                       0.099794266                      -0.020286416 
##                            LDA_00                            LDA_01 
##                       0.078606761                      -0.037637327 
##                            LDA_04                         timedelta 
##                       0.036390136                      -0.083690516 
##                   n_unique_tokens                         num_hrefs 
##                       0.021708235                       0.034101126 
##                           channel                           weekday 
##                       0.334213224                       0.107371741 
##                        num_videos      abs_title_sentiment_polarity 
##                       0.012414035                      -0.018257455 
##     I(title_sentiment_polarity^2)                title_subjectivity 
##                       0.018013943                       0.016098700 
##                       I(LDA_03^2)    I(self_reference_min_shares^2) 
##                      -0.025016886                       0.031459974 
##                   I(kw_avg_avg^2)   I(self_reference_avg_sharess^2) 
##                      -0.073769276                      -0.094588139 
##                       I(LDA_00^2)                       I(LDA_01^2) 
##                       0.011991181                       0.018771902 
##                       I(LDA_02^2)                       I(LDA_04^2) 
##                      -0.038724766                       0.011180526 
##                    I(timedelta^2)              I(n_unique_tokens^2) 
##                       0.099622595                      -0.054584948 
##                    I(num_hrefs^2)                      I(channel^2) 
##                      -0.006219745                      -0.284549630 
##                      I(weekday^2)                   I(num_videos^2) 
##                      -0.142027403                      -0.013895439 
##  LDA_03:self_reference_min_shares I(abs_title_sentiment_polarity^2) 
##                      -0.015911118                       0.000000000
# Test-set predictions and performance for the backward-selection model.
test_results$bw <- predict(back_tune, dataTest)
postResample(pred = test_results$bw,  obs = test_results$shares)
##       RMSE   Rsquared        MAE 
## 0.97431290 0.07689517 0.66412417
# Observed vs predicted for backward selection, with the identity line in blue.
ggplot(test_results, aes(x = bw, y = shares)) +
  geom_point() +
  labs(title="Backward Regression Observed VS Predicted", x="Predicted", y="Observed") +
  geom_abline(intercept = 0, slope = 1, colour = "blue") +
  theme_bw()

More or less the same results — nothing great. R² keeps being between 8 and 10%, so no improvement.

Stepwise regression

tic(); cl = startParallel(); cl


# Stepwise (sequential replacement) selection via leaps; same nvmax grid as
# the backward search.
step_tune <- train(model, data = dataTrain, 
                   method = "leapSeq", 
                   preProc=c('scale', 'center'),
                   tuneGrid = expand.grid(nvmax = 2:31),
                   trControl = ctrl)

endParallel(cl); toc()
plot(step_tune)

# which variables are selected?
coef(step_tune$finalModel, step_tune$bestTune$nvmax)

# Test-set predictions and performance for the stepwise model.
test_results$seq <- predict(step_tune, dataTest)
postResample(pred = test_results$seq,  obs = test_results$shares)

Ridge regression

# NOTE(review): both packages are already loaded at the top of the document;
# these calls are harmless but redundant.
library(elasticnet)
library(caret)

# Tuning grid for the ridge penalty lambda: 100 values in [0, 0.1].
ridge_grid <- expand.grid(lambda = seq(0, .1, length = 100))

tic(); cl = startParallel(); cl

# Ridge regression (L2 penalty) over the lambda grid.
ridge_tune <- train(model, data = dataTrain,
                    method='ridge',
                    preProc=c('scale','center'),
                    tuneGrid = ridge_grid,
                    trControl=ctrl)

endParallel(cl); toc()

# CV error as a function of lambda, and the selected value.
plot(ridge_tune)

ridge_tune$bestTune

# Test-set predictions and performance for ridge.
test_results$ridge <- predict(ridge_tune, dataTest)

postResample(pred = test_results$ridge,  obs = test_results$shares)

In both, step wise and ridge regression, we keep getting more or less the same R2, despite this time it has taken a lot more of computational power. This is the output:

> postResample(pred = test_results$ridge,  obs = test_results$shares)
      RMSE   Rsquared        MAE 
0.97764666 0.08547507 0.66715508

The Lasso

# NOTE(review): this redefines ctrl with the same settings as before
# (5-fold CV repeated 5 times); kept for the chunk to be self-contained.
ctrl <- trainControl(method = "repeatedcv", 
                     number = 5, repeats = 5, allowParallel = TRUE)
# "fraction" is the lasso tuning parameter: the fraction of the full L1 norm.
lasso_grid <- expand.grid(fraction = seq(.01, 1, length = 5))

tic(); cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Lasso regression (L1 penalty) over the fraction grid.
lasso_tune <- train(model, data = dataTrain,
                    method='lasso',
                    preProc=c('scale','center'),
                    tuneGrid = lasso_grid,
                    trControl=ctrl)

endParallel(cl); toc()
## [1] "The model has finished."
## 15.04 sec elapsed
plot(lasso_tune)

# fraction = 1 means no shrinkage was selected (equivalent to OLS).
lasso_tune$bestTune
##   fraction
## 5        1
test_results$lasso <- predict(lasso_tune, dataTest)
postResample(pred = test_results$lasso,  obs = test_results$shares)
##       RMSE   Rsquared        MAE 
## 0.97431799 0.07678933 0.66412598
##       RMSE   Rsquared        MAE 
## 0.97431799 0.07678933 0.66412598

We can see the lasso is the same except for the penalty: ridge uses the squared coefficients whereas the lasso uses the absolute value. The best lambda here will again be close to 0 — a different shape but the same R². It is very difficult to improve results using linear formulas.

Elastic Net

This is the last model I am going to try, because as our linear model is quite bad, we are not gonna make many improvements.

# glmnet tunes two hyperparameters: alpha (ridge/lasso mix) and lambda.
modelLookup('glmnet')
##    model parameter                    label forReg forClass probModel
## 1 glmnet     alpha        Mixing Percentage   TRUE     TRUE      TRUE
## 2 glmnet    lambda Regularization Parameter   TRUE     TRUE      TRUE
tic(); cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# 2-D grid: alpha in [0, 0.2], lambda in [0, 0.1].
elastic_grid = expand.grid(alpha = seq(0, .2, 0.01), lambda = seq(0, .1, 0.01))

# Elastic net: combines the ridge (L2) and lasso (L1) penalties.
glmnet_tune <- train(model, data = dataTrain,
                     method='glmnet',
                     preProc=c('scale','center'),
                     tuneGrid = elastic_grid,
                     trControl=ctrl)

endParallel(cl); toc()
## [1] "The model has finished."
## 52.24 sec elapsed
plot(glmnet_tune)

# lambda = 0 selected: effectively no regularization helped here.
glmnet_tune$bestTune
##     alpha lambda
## 144  0.13      0
test_results$glmnet <- predict(glmnet_tune, dataTest)

postResample(pred = test_results$glmnet,  obs = test_results$shares)
##       RMSE   Rsquared        MAE 
## 0.97441221 0.07672888 0.66424899

This grid is in 2D: one parameter for the ridge penalty and the other for the lasso. It is more expensive and takes a bit longer, but it is still fast. We have not improved on the results of the other models — same R².

Machine Learning tools

K-Nearest-Neighbours - kNN

We apply these tools exactly as in classification. kNN is not a very strong learner, so let's see how it performs on this dataset.

# kknn tunes the max number of neighbours, the Minkowski distance and kernel.
modelLookup('kknn')
##   model parameter           label forReg forClass probModel
## 1  kknn      kmax Max. #Neighbors   TRUE     TRUE      TRUE
## 2  kknn  distance        Distance   TRUE     TRUE      TRUE
## 3  kknn    kernel          Kernel   TRUE     TRUE      TRUE
tic(); cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Only kmax is varied; distance is fixed to 2 (Euclidean) and the kernel to
# kknn's 'optimal' weighting.
knn_tune <- train(model, 
                  data = dataTrain,
                  method = "kknn",   
                  preProc=c('scale','center'),
                  tuneGrid = data.frame(kmax=c(11,13,15,19,21),distance=2,kernel='optimal'),
                  trControl = ctrl)

endParallel(cl); toc()
## [1] "The model has finished."
## 374.99 sec elapsed
plot(knn_tune)

test_results$knn <- predict(knn_tune, dataTest)

postResample(pred = test_results$knn,  obs = test_results$shares)
##       RMSE   Rsquared        MAE 
## 0.98677646 0.05842808 0.67342264

#As each post is largely unrelated to the others, we do not get good results. The R² has decreased to 6%, so no improvement has been achieved.

Random Forests

tic(); cl = startParallel(); cl
## [1] "The model has started processing. Wait until it finishes."
## socket cluster with 7 nodes on host 'localhost'
# Random forest: 100 trees, tuning mtry (variables tried at each split);
# importance = TRUE so varImp() can be used afterwards.
rf_tune <- train(model, 
                 data = dataTrain,
                 method = "rf",
                 preProc=c('scale','center'),
                 trControl = ctrl,
                 ntree = 100,
                 tuneGrid = data.frame(mtry=c(1,3,5,7)),
                 importance = TRUE)
endParallel(cl); toc() 
## [1] "The model has finished."
## 767.92 sec elapsed
plot(rf_tune)

test_results$rf <- predict(rf_tune, dataTest)

postResample(pred = test_results$rf,  obs = test_results$shares)
##      RMSE  Rsquared       MAE 
## 0.9579239 0.1164964 0.6473019

RMSE Rsquared MAE

0.9633765 0.1137545 0.6497545

For the first time, I am glad about getting R2 of 11%, this is because random forests use very complex trees.

Variable importance

plot(varImp(rf_tune, scale = F), scales = list(y = list(cex = .95)))

As before, LDA_03 is the most important variable, followed closely by timedelta, LDA_03 without the square, and num_hrefs — the number of reference links a post contains. timedelta is the time between the publication of the post and the data acquisition, so the more time passes, the higher the number of shares.

Gradient Boosting

# Gradient boosting via caret's xgbTree.
# Fixes vs. the original chunk:
#  * a comma was missing after `gamma = c(1, 2, 3)` in expand.grid(),
#    which made the chunk a syntax error;
#  * caret's xgbTree method requires all seven tuning columns in tuneGrid,
#    so colsample_bytree and min_child_weight are pinned to 1;
#  * the trailing `tic(); cl = startParallel(); cl` was a copy-paste slip —
#    replaced with endParallel()/toc(), matching every other chunk's
#    tic()/startParallel() ... endParallel()/toc() pattern.
tic(); cl = startParallel(); cl
xgb_tune <- train(model, 
                  data = dataTrain,
                  method = "xgbTree",
                  preProc=c('scale','center'),
                  objective="reg:squarederror",
                  trControl = ctrl,
                  tuneGrid = expand.grid(nrounds = c(100, 200),
                                         max_depth = c(1, 2, 3),
                                         eta = c(0.01, 0.1, 1),
                                         gamma = c(1, 2, 3),
                                         colsample_bytree = 1,
                                         min_child_weight = 1,
                                         subsample = c(0.1, 0.2, 0.3)))
endParallel(cl); toc()

test_results$xgb <- predict(xgb_tune, dataTest)

postResample(pred = test_results$xgb,  obs = test_results$shares)

The only difficulty is that this is so expensive, need to select very well the grid, in each data set, the numbers changes completely, they depend heavily of each. Is a very good tool but is very expensive for the computer.

For my case is around 10% R2 that is not better than forest but if we use 2 days of gradient boosting it will give us a higher %.

As, before, we are going to use ensemble so that we select the best model approach and get a higher accuracy. I must admit that I am quite frustrated because of the fact that I have not found a good model for the regression approach. Classification was okay, but for this set, as the most correlated variable is just about 8%, it is hard to be capable of predicting well.

Ensemble

Let’s summarize the MAE for all the tools

# MAE of every stored model against the log-scale test response.
apply(test_results[-1], 2, function(x) mean(abs(x - test_results$shares)))
##       alm       frw        bw     lasso    glmnet       knn        rf 
## 0.6641260 0.6643202 0.6641242 0.6641260 0.6642490 0.6734226 0.6473019
# Simple ensemble: unweighted average of the linear model, kNN and the forest.
test_results$comb = (test_results$alm + test_results$knn + test_results$rf)/3

postResample(pred = test_results$comb,  obs = test_results$shares)
##      RMSE  Rsquared       MAE 
## 0.9673855 0.1095884 0.6547216
 RMSE  Rsquared       MAE 

0.9716005 0.1115928 0.6565718

We get a good R2 for what we have seen until now, that is up to 11%

Final predictions

It is important the predictions but need to know the prediction intervals to detect opportunities and be capable of advertising more or less a post in a determined place and moment that could get more shares in a cheap / expensive way. If we use a linear model, the interval is given to 95%CI automatically but in machine learning there are no formulas, so we apply common sense:

# Back-transform the ensemble predictions from log scale to shares.
yhat = exp(test_results$comb)

head(yhat) 
## [1] 1008.2539 1103.0055 1251.3116 1249.3492  938.6784 1260.0269
hist(yhat, col="darkgreen")

These are the residuals of the test set; let's define the prediction intervals:

Prediction Intervals

The errors are more symmetric

# Observed shares on the original scale, and the prediction errors.
y = exp(test_results$shares)
error = y-yhat
hist(error, col="darkgreen")

Let’s use the first 200 posts in testing to compute the noise size

noise = error[1:200]

Prediction intervals: let’s fix a 90% confidence

# Empirical 90% prediction interval for the remaining test posts: shift each
# prediction by the 5% and 95% quantiles of the held-out noise.
lwr = yhat[201:length(yhat)] + quantile(noise,0.05, na.rm=T) #5%, lower part of the error. 
upr = yhat[201:length(yhat)] + quantile(noise,0.95, na.rm=T)

Then we need to sum the prediction. It is yhat +- the quantiles. In the 5% we sum but it is like the minus because the number is going to be negative. We use a part of the testing set to compute the quantiles and then with the intervals we use the second part, this is the final testing set.

Performance using the last prices in yhat:

# Assemble observed values, point predictions and interval bounds for the
# posts not used to estimate the noise quantiles.
predictions = data.frame(real=y[201:length(y)], fit=yhat[201:length(yhat)], lwr=lwr, upr=upr)

# Flag posts falling outside their 90% prediction interval.
predictions = predictions %>% mutate(out=factor(if_else(real<lwr | real>upr,1,0)))

# Empirical coverage miss rate (~15% here vs the nominal 10%).
mean(predictions$out==1)
## [1] 0.1492907
ggplot(predictions, aes(x=fit, y=real))+
  geom_point(aes(color=out)) + theme(legend.position="none") + ylim(0,  40000)+
  geom_ribbon(data=predictions,aes(ymin=lwr,ymax=upr),alpha=0.3) +
  labs(title = "Prediction intervals", x = "prediction",y="Real Number Of Shares")
## Warning: Removed 84 rows containing missing values (geom_point).

In red post that are inside the interval, and outside there are some posts, they are called opportunities. For posts above you need to advertise them because they will give you a lot of publicity and shares and for posts below you should not invest advertising a lot in them since they will not get much likes. You can increase or decrease the width by changing the %, 90% uses to be good, it depends on the company. This is the only way to compute CI using machine learning.

Final conclusions:

So, as we have been saying, the variability of the models explained in regression have been pretty low.

I have not been capable of getting more than 11% of variability explained, which seems like a disaster. It is true that for some datasets good accuracy cannot be obtained easily; I took this data from the page I cited because both classification and regression could be performed on it. I suspect there is a not-so-difficult way to find a model that fits this dataset properly.

On the other hand, as the correlation between the predictors and the target is so weak (at most about 8%), I think that is why it is so hard to predict anything here. I have tried reducing the number of variables, but the correlations obviously stay the same and so do the results, so I do not know what more to do.

Moreover, as time is limited, I have used parallel processing to compute and run the script but despite it, I have not used pretty much large grids, If I did, i think i would get a better correlation but not so high as in classification.

In binary classifications (conclusions about that are above just before regression), I have been capable of achieving a great result as it is so easy to predict if the post will have or not a number of shares greater than a threshold and classify it depending on that.

I hope you enjoyed the work and, please, if there is a way to improve this poor regression accuracy by changing the model, I would like to know it!